From ce250d57da19e1ac68f8b56eeb8714d295b5484d Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Tue, 21 Apr 2026 09:23:09 +0900 Subject: [PATCH 1/9] revert modular: changes break modular's purpose --- utils/modular_model_converter.py | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/utils/modular_model_converter.py b/utils/modular_model_converter.py index 48dc46b8b593..5fd453816f54 100644 --- a/utils/modular_model_converter.py +++ b/utils/modular_model_converter.py @@ -1692,10 +1692,6 @@ class NewNameModel(LlamaModel): class_file_type = find_file_type(class_name, new_name) # In this case, we need to remove it from the dependencies and create a new import instead if class_file_type != file_type: - # image_processing_pil and image_processing must never depend on each other. - # When a PIL class needs an image_processing class, inline it instead of importing. - if file_type == "image_processing_pil" and class_file_type == "image_processing": - continue corrected_dependencies.remove(class_name) import_statement = f"from .{class_file_type}_{new_name} import {class_name}" new_imports[class_name] = cst.parse_statement(import_statement) @@ -1748,14 +1744,7 @@ class node based on the inherited classes if needed. Also returns any new import # Remove all classes explicitly defined in modular from the dependencies. Otherwise, if a class is referenced # before its new modular definition, it may be wrongly imported from elsewhere as a dependency if it matches # another class from a modeling file after renaming, even though it would be added after anyway (leading to duplicates) - # Exception: for image_processing_pil files, image_processing modular classes must be inlined (not excluded), - # because these two files must never import from each other. - classes_to_exclude = set(modular_mapper.classes.keys()) - if file_type == "image_processing_pil": - classes_to_exclude -= { - k for k in classes_to_exclude if find_file_type(k, model_name) == "image_processing" - } - new_node_dependencies -= classes_to_exclude + new_node_dependencies -= set(modular_mapper.classes.keys()) # The node was modified -> look for all recursive dependencies of the new node all_dependencies_to_add = find_all_dependencies( @@ -1790,9 +1779,7 @@ class node based on the inherited classes if needed. 
Also returns any new import relative_dependency_order = modular_mapper.compute_relative_order(all_dependencies_to_add) nodes_to_add = { - dep: (relative_dependency_order[dep], modular_mapper.global_nodes[dep]) - for dep in all_dependencies_to_add - if dep not in file_to_update + dep: (relative_dependency_order[dep], mapper.global_nodes[dep]) for dep in all_dependencies_to_add } # Add the class node itself to the nodes to add From 25b19dcb681b3b2b53fb051dd3082a4a00f8e1ec Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Tue, 21 Apr 2026 09:40:29 +0900 Subject: [PATCH 2/9] revert the changes to the modulars --- .../modular_conditional_detr.py | 17 +---------- .../models/deepseek_vl/modular_deepseek_vl.py | 12 +------- .../modular_deformable_detr.py | 17 +---------- .../efficientloftr/modular_efficientloftr.py | 10 ------- .../modular_ernie4_5_vl_moe.py | 19 +++---------- .../grounding_dino/modular_grounding_dino.py | 16 ----------- .../models/lightglue/modular_lightglue.py | 13 +++------ .../modular_llava_onevision.py | 13 +-------- .../models/mask2former/modular_mask2former.py | 28 ------------------- .../paddleocr_vl/modular_paddleocr_vl.py | 11 ++------ .../models/rt_detr/modular_rt_detr.py | 14 ---------- .../models/segformer/modular_segformer.py | 12 -------- .../models/smolvlm/modular_smolvlm.py | 18 +----------- .../video_llama_3/modular_video_llama_3.py | 25 +++-------------- 14 files changed, 19 insertions(+), 206 deletions(-) diff --git a/src/transformers/models/conditional_detr/modular_conditional_detr.py b/src/transformers/models/conditional_detr/modular_conditional_detr.py index ffc1e78bee01..2205b85c5547 100644 --- a/src/transformers/models/conditional_detr/modular_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modular_conditional_detr.py @@ -20,13 +20,12 @@ from ...image_transforms import ( center_to_corners_format, ) -from ...image_utils import AnnotationFormat from ...masking_utils import create_bidirectional_mask from ...modeling_outputs import ( BaseModelOutput, ) from ...modeling_utils import ALL_ATTENTION_FUNCTIONS -from ...processing_utils import ImagesKwargs, Unpack +from ...processing_utils import Unpack from ...utils import ( TensorType, TransformersKwargs, @@ -66,20 +65,6 @@ logger = logging.get_logger(__name__) -class ConditionalDetrImageProcessorKwargs(ImagesKwargs, total=False): - r""" - format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): - Data format of the annotations. One of "coco_detection" or "coco_panoptic". - do_convert_annotations (`bool`, *optional*, defaults to `True`): - Controls whether to convert the annotations to the format expected by the CONDITIONAL_DETR model. Converts the - bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`. - Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method. 
- """ - - format: str | AnnotationFormat - do_convert_annotations: bool - - class ConditionalDetrImageProcessor(DetrImageProcessor): def post_process_object_detection( self, outputs, threshold: float = 0.5, target_sizes: TensorType | list[tuple] = None, top_k: int = 100 diff --git a/src/transformers/models/deepseek_vl/modular_deepseek_vl.py b/src/transformers/models/deepseek_vl/modular_deepseek_vl.py index a56da6f3fe0a..be955c6fd41e 100644 --- a/src/transformers/models/deepseek_vl/modular_deepseek_vl.py +++ b/src/transformers/models/deepseek_vl/modular_deepseek_vl.py @@ -20,7 +20,7 @@ from ...configuration_utils import PreTrainedConfig from ...image_processing_utils import BatchFeature from ...image_utils import ImageInput -from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack +from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import ( PreTokenizedInput, TextInput, @@ -152,16 +152,6 @@ def generate(self): raise AttributeError("Not needed for DeepseekVL") -class DeepseekVLImageProcessorKwargs(ImagesKwargs, total=False): - r""" - min_size (`int`, *optional*, defaults to 14): - The minimum allowed size for the resized image. Ensures that neither the height nor width - falls below this value after resizing. - """ - - min_size: int - - class DeepseekVLImageProcessorPil(JanusImageProcessorPil): def postprocess(self): raise AttributeError("Not needed for DeepseekVL") diff --git a/src/transformers/models/deformable_detr/modular_deformable_detr.py b/src/transformers/models/deformable_detr/modular_deformable_detr.py index a2f80e8236ad..a4a5b4acd95a 100644 --- a/src/transformers/models/deformable_detr/modular_deformable_detr.py +++ b/src/transformers/models/deformable_detr/modular_deformable_detr.py @@ -23,11 +23,10 @@ from ... import initialization as init from ...backbone_utils import load_backbone from ...image_transforms import center_to_corners_format -from ...image_utils import AnnotationFormat from ...integrations import use_kernel_forward_from_hub from ...modeling_outputs import BaseModelOutput from ...modeling_utils import PreTrainedModel -from ...processing_utils import ImagesKwargs, Unpack +from ...processing_utils import Unpack from ...utils import ( ModelOutput, TensorType, @@ -61,20 +60,6 @@ logger = logging.get_logger(__name__) -class DeformableDetrImageProcessorKwargs(ImagesKwargs, total=False): - r""" - format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): - Data format of the annotations. One of "coco_detection" or "coco_panoptic". - do_convert_annotations (`bool`, *optional*, defaults to `True`): - Controls whether to convert the annotations to the format expected by the DEFORMABLE_DETR model. Converts the - bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`. - Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method. 
- """ - - format: str | AnnotationFormat - do_convert_annotations: bool - - class DeformableDetrImageProcessor(DetrImageProcessor): def post_process_object_detection( self, outputs, threshold: float = 0.5, target_sizes: TensorType | list[tuple] = None, top_k: int = 100 diff --git a/src/transformers/models/efficientloftr/modular_efficientloftr.py b/src/transformers/models/efficientloftr/modular_efficientloftr.py index 17e3e399a8df..86d8d34eba70 100644 --- a/src/transformers/models/efficientloftr/modular_efficientloftr.py +++ b/src/transformers/models/efficientloftr/modular_efficientloftr.py @@ -1,6 +1,5 @@ from typing import TYPE_CHECKING -from ...processing_utils import ImagesKwargs from ...utils import TensorType, is_torch_available from ...utils.import_utils import requires from ..superglue.image_processing_pil_superglue import SuperGlueImageProcessorPil @@ -14,15 +13,6 @@ from .modeling_efficientloftr import EfficientLoFTRKeypointMatchingOutput -class EfficientLoFTRImageProcessorKwargs(ImagesKwargs, total=False): - r""" - do_grayscale (`bool`, *optional*, defaults to `self.do_grayscale`): - Whether to convert the image to grayscale. Can be overridden by `do_grayscale` in the `preprocess` method. - """ - - do_grayscale: bool - - class EfficientLoFTRImageProcessor(SuperGlueImageProcessor): def post_process_keypoint_matching( self, diff --git a/src/transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py b/src/transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py index 42bbb44b70a5..0e9f27d5c41d 100644 --- a/src/transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +++ b/src/transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py @@ -43,7 +43,7 @@ from ...modeling_outputs import BaseModelOutputWithPooling, MoeCausalLMOutputWithPast, MoeModelOutputWithPast from ...modeling_rope_utils import dynamic_rope_update from ...modeling_utils import PreTrainedModel -from ...processing_utils import ImagesKwargs, Unpack +from ...processing_utils import Unpack from ...utils import ( TensorType, TransformersKwargs, @@ -63,7 +63,7 @@ Ernie4_5_MoeStatics, Ernie4_5_MoeTopKRouter, ) -from ..glm4v.image_processing_glm4v import Glm4vImageProcessor +from ..glm4v.image_processing_glm4v import Glm4vImageProcessor, Glm4vImageProcessorKwargs from ..glm4v.image_processing_pil_glm4v import Glm4vImageProcessorPil from ..glm4v.modeling_glm4v import Glm4vForConditionalGeneration from ..mixtral.modeling_mixtral import load_balancing_loss_func @@ -1220,19 +1220,8 @@ def forward( ) -class Ernie4_5_VLMoeImageProcessorKwargs(ImagesKwargs, total=False): - r""" - patch_size (`int`, *optional*, defaults to 14): - The spatial patch size of the vision encoder. - temporal_patch_size (`int`, *optional*): - The temporal patch size of the vision encoder. Unused in the image processor, only used for videos. - merge_size (`int`, *optional*, defaults to 2): - The merge size of the vision encoder to llm encoder. 
- """ - - patch_size: int - temporal_patch_size: int - merge_size: int +class Ernie4_5_VLMoeImageProcessorKwargs(Glm4vImageProcessorKwargs): + pass class Ernie4_5_VLMoeImageProcessorPil(Glm4vImageProcessorPil): diff --git a/src/transformers/models/grounding_dino/modular_grounding_dino.py b/src/transformers/models/grounding_dino/modular_grounding_dino.py index 483ad262a602..bd35fd512ffe 100644 --- a/src/transformers/models/grounding_dino/modular_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modular_grounding_dino.py @@ -25,8 +25,6 @@ from transformers.models.detr.image_processing_pil_detr import DetrImageProcessorPil from ...image_transforms import center_to_corners_format -from ...image_utils import AnnotationFormat -from ...processing_utils import ImagesKwargs from ...utils import ( TensorType, logging, @@ -70,20 +68,6 @@ def _scale_boxes(boxes, target_sizes): return boxes -class GroundingDinoImageProcessorKwargs(ImagesKwargs, total=False): - r""" - format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): - Data format of the annotations. One of "coco_detection" or "coco_panoptic". - do_convert_annotations (`bool`, *optional*, defaults to `True`): - Controls whether to convert the annotations to the format expected by the GROUNDING_DINO model. Converts the - bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`. - Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method. - """ - - format: str | AnnotationFormat - do_convert_annotations: bool - - class GroundingDinoImageProcessor(DetrImageProcessor): def post_process_object_detection( self, diff --git a/src/transformers/models/lightglue/modular_lightglue.py b/src/transformers/models/lightglue/modular_lightglue.py index 62082b678b00..afc8a3efec25 100644 --- a/src/transformers/models/lightglue/modular_lightglue.py +++ b/src/transformers/models/lightglue/modular_lightglue.py @@ -23,7 +23,7 @@ from ...configuration_utils import PreTrainedConfig from ...modeling_flash_attention_utils import FlashAttentionKwargs from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel -from ...processing_utils import ImagesKwargs, Unpack +from ...processing_utils import Unpack from ...utils import ModelOutput, TensorType, auto_docstring, can_return_tuple, logging from ...utils.import_utils import requires from ..auto import CONFIG_MAPPING, AutoConfig @@ -32,7 +32,7 @@ from ..cohere.modeling_cohere import apply_rotary_pos_emb from ..llama.modeling_llama import LlamaAttention, eager_attention_forward from ..superglue.image_processing_pil_superglue import SuperGlueImageProcessorPil -from ..superglue.image_processing_superglue import SuperGlueImageProcessor +from ..superglue.image_processing_superglue import SuperGlueImageProcessor, SuperGlueImageProcessorKwargs from ..superpoint import SuperPointConfig @@ -154,13 +154,8 @@ class LightGlueKeypointMatchingOutput(ModelOutput): attentions: tuple[torch.FloatTensor] | None = None -class LightGlueImageProcessorKwargs(ImagesKwargs, total=False): - r""" - do_grayscale (`bool`, *optional*, defaults to `self.do_grayscale`): - Whether to convert the image to grayscale. Can be overridden by `do_grayscale` in the `preprocess` method. 
- """ - - do_grayscale: bool +class LightGlueImageProcessorKwargs(SuperGlueImageProcessorKwargs): + pass class LightGlueImageProcessor(SuperGlueImageProcessor): diff --git a/src/transformers/models/llava_onevision/modular_llava_onevision.py b/src/transformers/models/llava_onevision/modular_llava_onevision.py index a3634aa17cba..f44a4612cdc2 100644 --- a/src/transformers/models/llava_onevision/modular_llava_onevision.py +++ b/src/transformers/models/llava_onevision/modular_llava_onevision.py @@ -34,7 +34,7 @@ ) from ...modeling_flash_attention_utils import FlashAttentionKwargs from ...modeling_outputs import BaseModelOutputWithPooling -from ...processing_utils import ImagesKwargs, Unpack +from ...processing_utils import Unpack from ...utils import TensorType, auto_docstring, logging from ...utils.generic import can_return_tuple, merge_with_config_defaults from ..llava_next.image_processing_llava_next import LlavaNextImageProcessor, LlavaNextImageProcessorKwargs @@ -217,17 +217,6 @@ def _preprocess( ) -class LlavaOnevisionImageProcessorKwargs(ImagesKwargs, total=False): - r""" - image_grid_pinpoints (`list[list[int]]`, *optional*): - A list of possible resolutions to use for processing high resolution images. The best resolution is selected - based on the original size of the image. Can be overridden by `image_grid_pinpoints` in the `preprocess` - method. - """ - - image_grid_pinpoints: list[list[int]] - - class LlavaOnevisionImageProcessorPil(LlavaNextImageProcessorPil): resample = PILImageResampling.BICUBIC image_mean = OPENAI_CLIP_MEAN diff --git a/src/transformers/models/mask2former/modular_mask2former.py b/src/transformers/models/mask2former/modular_mask2former.py index 87f2b834991f..089baffe5df7 100644 --- a/src/transformers/models/mask2former/modular_mask2former.py +++ b/src/transformers/models/mask2former/modular_mask2former.py @@ -15,8 +15,6 @@ import torch from torch import nn -from ...image_utils import SizeDict -from ...processing_utils import ImagesKwargs from ...utils import ( TensorType, logging, @@ -35,32 +33,6 @@ logger = logging.get_logger(__name__) -class Mask2FormerImageProcessorKwargs(ImagesKwargs, total=False): - r""" - ignore_index (`int`, *optional*): - Label to be assigned to background pixels in segmentation maps. If provided, segmentation map pixels - denoted with 0 (background) will be replaced with `ignore_index`. - do_reduce_labels (`bool`, *optional*, defaults to `False`): - Whether or not to decrement all label values of segmentation maps by 1. Usually used for datasets where 0 - is used for background, and background itself is not included in all classes of a dataset (e.g. ADE20k). - The background label will be replaced by `ignore_index`. - num_labels (`int`, *optional*): - The number of labels in the segmentation map. - size_divisor (`int`, *optional*, defaults to `32`): - Some backbones need images divisible by a certain number. If not passed, it defaults to the value used in - Swin Transformer. - pad_size (`SizeDict`, *optional*): - The size to pad the images to. Must be larger than any image size provided for preprocessing. If `pad_size` - is not provided, images will be padded to the largest height and width in the batch. 
- """ - - ignore_index: int | None - do_reduce_labels: bool - num_labels: int | None - size_divisor: int - pad_size: SizeDict | None - - class Mask2FormerImageProcessor(MaskFormerImageProcessor): def post_process_semantic_segmentation( self, outputs, target_sizes: list[tuple[int, int]] | None = None diff --git a/src/transformers/models/paddleocr_vl/modular_paddleocr_vl.py b/src/transformers/models/paddleocr_vl/modular_paddleocr_vl.py index 20a897059a4e..02895d6e2576 100644 --- a/src/transformers/models/paddleocr_vl/modular_paddleocr_vl.py +++ b/src/transformers/models/paddleocr_vl/modular_paddleocr_vl.py @@ -38,9 +38,8 @@ from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPast, BaseModelOutputWithPooling from ...modeling_utils import PreTrainedModel from ...models.qwen2_vl.image_processing_pil_qwen2_vl import Qwen2VLImageProcessorPil -from ...models.qwen2_vl.image_processing_qwen2_vl import Qwen2VLImageProcessor +from ...models.qwen2_vl.image_processing_qwen2_vl import Qwen2VLImageProcessor, Qwen2VLImageProcessorKwargs from ...processing_utils import ( - ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack, @@ -123,7 +122,7 @@ def smart_resize( return h_bar, w_bar -class PaddleOCRVLImageProcessorKwargs(ImagesKwargs, total=False): +class PaddleOCRVLImageProcessorKwargs(Qwen2VLImageProcessorKwargs): r""" patch_size (`int`, *optional*, defaults to 14): The spatial patch size of the vision encoder. @@ -133,12 +132,6 @@ class PaddleOCRVLImageProcessorKwargs(ImagesKwargs, total=False): The merge size of the vision encoder to llm encoder. """ - min_pixels: int - max_pixels: int - patch_size: int - temporal_patch_size: int - merge_size: int - class PaddleOCRVLImageProcessorPil(Qwen2VLImageProcessorPil): size = {"shortest_edge": 384 * 384, "longest_edge": 1536 * 1536} diff --git a/src/transformers/models/rt_detr/modular_rt_detr.py b/src/transformers/models/rt_detr/modular_rt_detr.py index 97136541d6ec..cd4e8faf3fc2 100644 --- a/src/transformers/models/rt_detr/modular_rt_detr.py +++ b/src/transformers/models/rt_detr/modular_rt_detr.py @@ -426,20 +426,6 @@ def post_process_panoptic_segmentation(self): raise NotImplementedError("Panoptic segmentation post-processing is not implemented for RT-DETR yet.") -class RTDetrImageProcessorKwargs(ImagesKwargs, total=False): - r""" - format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): - Data format of the annotations. One of "coco_detection" or "coco_panoptic". - do_convert_annotations (`bool`, *optional*, defaults to `True`): - Controls whether to convert the annotations to the format expected by the RT_DETR model. Converts the - bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`. - Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method. 
- """ - - format: str | AnnotationFormat - do_convert_annotations: bool - - @requires(backends=("torch",)) class RTDetrImageProcessorPil(DetrImageProcessorPil): resample = PILImageResampling.BILINEAR diff --git a/src/transformers/models/segformer/modular_segformer.py b/src/transformers/models/segformer/modular_segformer.py index d7f339ea6e42..414dc58e8c52 100644 --- a/src/transformers/models/segformer/modular_segformer.py +++ b/src/transformers/models/segformer/modular_segformer.py @@ -31,22 +31,10 @@ PILImageResampling, SizeDict, ) -from ...processing_utils import ImagesKwargs from ...utils import TensorType from ...utils.import_utils import requires -class SegformerImageProcessorKwargs(ImagesKwargs, total=False): - r""" - do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`): - Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0 - is used for background, and background itself is not included in all classes of a dataset (e.g. - ADE20k). The background label will be replaced by 255. - """ - - do_reduce_labels: bool - - class SegformerImageProcessor(BeitImageProcessor): resample = PILImageResampling.BILINEAR image_mean = IMAGENET_DEFAULT_MEAN diff --git a/src/transformers/models/smolvlm/modular_smolvlm.py b/src/transformers/models/smolvlm/modular_smolvlm.py index 9c572cc9d877..cf91863c56a7 100644 --- a/src/transformers/models/smolvlm/modular_smolvlm.py +++ b/src/transformers/models/smolvlm/modular_smolvlm.py @@ -22,7 +22,7 @@ from ...generation import GenerationConfig from ...modeling_flash_attention_utils import FlashAttentionKwargs from ...modeling_outputs import BaseModelOutputWithPooling -from ...processing_utils import ImagesKwargs, Unpack +from ...processing_utils import Unpack from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging, torch_compilable_check from ..idefics3.configuration_idefics3 import Idefics3Config, Idefics3VisionConfig from ..idefics3.image_processing_idefics3 import Idefics3ImageProcessor @@ -91,22 +91,6 @@ class SmolVLMConfig(Idefics3Config): model_type = "smolvlm" -class SmolVLMImageProcessorKwargs(ImagesKwargs, total=False): - """ - do_image_splitting (`bool`, *optional*, defaults to `True`): - Whether to split the image into sub-images concatenated with the original image. They are split into patches - such that each patch has a size of `max_image_size["height"]` x `max_image_size["width"]`. - max_image_size (`Dict`, *optional*, defaults to `{"longest_edge": 364}`): - Maximum resolution of the patches of images accepted by the model. This is a dictionary containing the key "longest_edge". - return_row_col_info (`bool`, *optional*, defaults to `False`): - Whether to return the row and column information of the images. 
- """ - - do_image_splitting: bool - max_image_size: dict[str, int] - return_row_col_info: bool - - class SmolVLMImageProcessor(Idefics3ImageProcessor): pass diff --git a/src/transformers/models/video_llama_3/modular_video_llama_3.py b/src/transformers/models/video_llama_3/modular_video_llama_3.py index 4eef74580c87..c4a9e40bc8f0 100644 --- a/src/transformers/models/video_llama_3/modular_video_llama_3.py +++ b/src/transformers/models/video_llama_3/modular_video_llama_3.py @@ -37,7 +37,7 @@ ) from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ModelOutput from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel -from ...processing_utils import ImagesKwargs, Unpack +from ...processing_utils import Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...utils import ( TensorType, @@ -55,7 +55,7 @@ from ..auto import CONFIG_MAPPING, AutoConfig from ..auto.modeling_auto import AutoModel from ..qwen2_vl.image_processing_pil_qwen2_vl import Qwen2VLImageProcessorPil -from ..qwen2_vl.image_processing_qwen2_vl import Qwen2VLImageProcessor, smart_resize +from ..qwen2_vl.image_processing_qwen2_vl import Qwen2VLImageProcessor, Qwen2VLImageProcessorKwargs, smart_resize from ..qwen2_vl.modeling_qwen2_vl import ( Qwen2VLForConditionalGeneration, Qwen2VLModel, @@ -1107,25 +1107,8 @@ def model_input_names(self): raise AttributeError("VideoLlama doesn't need to override it") -class VideoLlama3ImageProcessorKwargs(ImagesKwargs, total=False): - r""" - min_pixels (`int`, *optional*, defaults to `56 * 56`): - The min pixels of the image to resize the image. - max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`): - The max pixels of the image to resize the image. - patch_size (`int`, *optional*, defaults to 14): - The spatial patch size of the vision encoder. - temporal_patch_size (`int`, *optional*, defaults to 2): - The temporal patch size of the vision encoder. - merge_size (`int`, *optional*, defaults to 2): - The merge size of the vision encoder to llm encoder. - """ - - min_pixels: int - max_pixels: int - patch_size: int - temporal_patch_size: int - merge_size: int +class VideoLlama3ImageProcessorKwargs(Qwen2VLImageProcessorKwargs): + pass class VideoLlama3ImageProcessorPil(Qwen2VLImageProcessorPil): From ef6419a6dd96620af5c58db50262e86287d1b6c8 Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Tue, 21 Apr 2026 09:42:58 +0900 Subject: [PATCH 3/9] forgot docstring --- .../models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py b/src/transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py index 0e9f27d5c41d..ad47bc0508a3 100644 --- a/src/transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +++ b/src/transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py @@ -1221,7 +1221,14 @@ def forward( class Ernie4_5_VLMoeImageProcessorKwargs(Glm4vImageProcessorKwargs): - pass + r""" + patch_size (`int`, *optional*, defaults to 14): + The spatial patch size of the vision encoder. + temporal_patch_size (`int`, *optional*): + The temporal patch size of the vision encoder. Unused in the image processor, only used for videos. + merge_size (`int`, *optional*, defaults to 2): + The merge size of the vision encoder to llm encoder. 
+ """ class Ernie4_5_VLMoeImageProcessorPil(Glm4vImageProcessorPil): From 1c2343b2b5a57a077e65dba6094c1f2f365c893d Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Tue, 21 Apr 2026 09:59:45 +0900 Subject: [PATCH 4/9] oups --- utils/modular_model_converter.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/utils/modular_model_converter.py b/utils/modular_model_converter.py index 5fd453816f54..d5dc7dfe23b6 100644 --- a/utils/modular_model_converter.py +++ b/utils/modular_model_converter.py @@ -1755,13 +1755,7 @@ class node based on the inherited classes if needed. Also returns any new import relative_dependency_order = mapper.compute_relative_order(all_dependencies_to_add) nodes_to_add = { - dep: ( - relative_dependency_order[dep], - # If this dependency is explicitly defined in the modular, prefer the modular's version. - # This prevents a renamed parent class from overriding a modular-defined class of the same name. - modular_mapper.global_nodes[dep] if dep in modular_mapper.classes else mapper.global_nodes[dep], - ) - for dep in all_dependencies_to_add + dep: (relative_dependency_order[dep], mapper.global_nodes[dep]) for dep in all_dependencies_to_add } # No transformers (modeling file) super class, just check functions and assignments dependencies @@ -1779,7 +1773,9 @@ class node based on the inherited classes if needed. Also returns any new import relative_dependency_order = modular_mapper.compute_relative_order(all_dependencies_to_add) nodes_to_add = { - dep: (relative_dependency_order[dep], mapper.global_nodes[dep]) for dep in all_dependencies_to_add + dep: (relative_dependency_order[dep], modular_mapper.global_nodes[dep]) + for dep in all_dependencies_to_add + if dep not in file_to_update } # Add the class node itself to the nodes to add From 56c04dff404b45194920d012a5671ea6cfbb13d0 Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Tue, 21 Apr 2026 10:02:57 +0900 Subject: [PATCH 5/9] apply modular converter --- src/transformers/dependency_versions_table.py | 1 + .../image_processing_pil_conditional_detr.py | 18 ++--------- .../image_processing_pil_deepseek_vl.py | 13 ++------ ...image_processing_pil_deepseek_vl_hybrid.py | 30 ++----------------- .../image_processing_pil_deformable_detr.py | 18 ++--------- .../image_processing_pil_efficientloftr.py | 12 ++------ .../image_processing_pil_ernie4_5_vl_moe.py | 18 ++--------- .../glm46v/image_processing_pil_glm46v.py | 18 ++--------- .../image_processing_pil_glm_image.py | 25 ++-------------- .../image_processing_pil_grounding_dino.py | 18 ++--------- .../image_processing_pil_lightglue.py | 12 ++------ .../image_processing_pil_llava_onevision.py | 14 ++------- .../image_processing_pil_mask2former.py | 29 ++---------------- .../image_processing_pil_paddleocr_vl.py | 20 ++----------- .../rt_detr/image_processing_pil_rt_detr.py | 18 ++--------- .../image_processing_pil_segformer.py | 14 ++------- .../smolvlm/image_processing_pil_smolvlm.py | 19 ++---------- .../image_processing_pil_video_llama_3.py | 24 ++------------- .../yolos/image_processing_pil_yolos.py | 19 ++---------- 19 files changed, 37 insertions(+), 303 deletions(-) diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index b08aa558d795..0456904dd3d5 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -56,6 +56,7 @@ "rjieba": "rjieba", "rouge-score": "rouge-score!=0.0.7,!=0.0.8,!=0.1,!=0.1.1", "ruff": "ruff==0.14.10", + 
"transformers-mlinter": "transformers-mlinter @ git+https://github.com/huggingface/transformers-mlinter@b9d319ce264c106f97a959d926ef42bc3c0ea4d1", "ty": "ty==0.0.20", "sacrebleu": "sacrebleu>=1.4.12,<2.0.0", "sacremoses": "sacremoses", diff --git a/src/transformers/models/conditional_detr/image_processing_pil_conditional_detr.py b/src/transformers/models/conditional_detr/image_processing_pil_conditional_detr.py index 359c4c706f7c..30b6e2752273 100644 --- a/src/transformers/models/conditional_detr/image_processing_pil_conditional_detr.py +++ b/src/transformers/models/conditional_detr/image_processing_pil_conditional_detr.py @@ -48,9 +48,10 @@ get_max_height_width, validate_annotations, ) -from ...processing_utils import ImagesKwargs, Unpack +from ...processing_utils import Unpack from ...utils import TensorType, auto_docstring, is_torch_available, is_vision_available, logging, requires_backends from ...utils.import_utils import requires +from .image_processing_conditional_detr import ConditionalDetrImageProcessorKwargs if is_vision_available(): @@ -61,21 +62,6 @@ logger = logging.get_logger(__name__) - -class ConditionalDetrImageProcessorKwargs(ImagesKwargs, total=False): - r""" - format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): - Data format of the annotations. One of "coco_detection" or "coco_panoptic". - do_convert_annotations (`bool`, *optional*, defaults to `True`): - Controls whether to convert the annotations to the format expected by the CONDITIONAL_DETR model. Converts the - bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`. - Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method. - """ - - format: str | AnnotationFormat - do_convert_annotations: bool - - SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC) diff --git a/src/transformers/models/deepseek_vl/image_processing_pil_deepseek_vl.py b/src/transformers/models/deepseek_vl/image_processing_pil_deepseek_vl.py index e868830b0220..d29296535277 100644 --- a/src/transformers/models/deepseek_vl/image_processing_pil_deepseek_vl.py +++ b/src/transformers/models/deepseek_vl/image_processing_pil_deepseek_vl.py @@ -32,18 +32,9 @@ PILImageResampling, SizeDict, ) -from ...processing_utils import ImagesKwargs, Unpack +from ...processing_utils import Unpack from ...utils import TensorType, auto_docstring - - -class DeepseekVLImageProcessorKwargs(ImagesKwargs, total=False): - r""" - min_size (`int`, *optional*, defaults to 14): - The minimum allowed size for the resized image. Ensures that neither the height nor width - falls below this value after resizing. - """ - - min_size: int +from .image_processing_deepseek_vl import DeepseekVLImageProcessorKwargs @auto_docstring diff --git a/src/transformers/models/deepseek_vl_hybrid/image_processing_pil_deepseek_vl_hybrid.py b/src/transformers/models/deepseek_vl_hybrid/image_processing_pil_deepseek_vl_hybrid.py index 55573c35c423..c7ef92dce05f 100644 --- a/src/transformers/models/deepseek_vl_hybrid/image_processing_pil_deepseek_vl_hybrid.py +++ b/src/transformers/models/deepseek_vl_hybrid/image_processing_pil_deepseek_vl_hybrid.py @@ -19,7 +19,6 @@ # limitations under the License. 
from collections.abc import Iterable -from typing import Union import numpy as np @@ -34,34 +33,9 @@ PILImageResampling, SizeDict, ) -from ...processing_utils import ImagesKwargs, Unpack +from ...processing_utils import Unpack from ...utils import TensorType, auto_docstring - - -class DeepseekVLHybridImageProcessorKwargs(ImagesKwargs, total=False): - r""" - min_size (`int`, *optional*, defaults to 14): - The minimum allowed size for the resized image. Ensures that neither the height nor width - falls below this value after resizing. - high_res_size (`dict`, *optional*, defaults to `{"height": 1024, "width": 1024}`): - Size of the high resolution output image after resizing. Can be overridden by the `high_res_size` parameter in the `preprocess` - method. - high_res_resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`): - Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be - overridden by the `high_res_resample` parameter in the `preprocess` method. - high_res_image_mean (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_MEAN`): - Mean to use if normalizing the high resolution image. This is a float or list of floats the length of the number of - channels in the image. Can be overridden by the `high_res_image_mean` parameter in the `preprocess` method. - high_res_image_std (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_STD`): - Standard deviation to use if normalizing the high resolution image. This is a float or list of floats the length of the - number of channels in the image. Can be overridden by the `high_res_image_std` parameter in the `preprocess` method. - """ - - min_size: int - high_res_size: dict - high_res_resample: Union["PILImageResampling", int] - high_res_image_mean: float | list[float] | tuple[float, ...] - high_res_image_std: float | list[float] | tuple[float, ...] +from .image_processing_deepseek_vl_hybrid import DeepseekVLHybridImageProcessorKwargs @auto_docstring diff --git a/src/transformers/models/deformable_detr/image_processing_pil_deformable_detr.py b/src/transformers/models/deformable_detr/image_processing_pil_deformable_detr.py index fcd95fa4647f..dd66876deca4 100644 --- a/src/transformers/models/deformable_detr/image_processing_pil_deformable_detr.py +++ b/src/transformers/models/deformable_detr/image_processing_pil_deformable_detr.py @@ -47,9 +47,10 @@ get_max_height_width, validate_annotations, ) -from ...processing_utils import ImagesKwargs, Unpack +from ...processing_utils import Unpack from ...utils import TensorType, auto_docstring, is_torch_available, is_vision_available from ...utils.import_utils import requires, requires_backends +from .image_processing_deformable_detr import DeformableDetrImageProcessorKwargs if is_vision_available(): @@ -57,21 +58,6 @@ if is_torch_available(): import torch - -class DeformableDetrImageProcessorKwargs(ImagesKwargs, total=False): - r""" - format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): - Data format of the annotations. One of "coco_detection" or "coco_panoptic". - do_convert_annotations (`bool`, *optional*, defaults to `True`): - Controls whether to convert the annotations to the format expected by the DEFORMABLE_DETR model. Converts the - bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`. - Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method. 
- """ - - format: str | AnnotationFormat - do_convert_annotations: bool - - SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC) diff --git a/src/transformers/models/efficientloftr/image_processing_pil_efficientloftr.py b/src/transformers/models/efficientloftr/image_processing_pil_efficientloftr.py index 5f467c56dd4f..66f7314143f3 100644 --- a/src/transformers/models/efficientloftr/image_processing_pil_efficientloftr.py +++ b/src/transformers/models/efficientloftr/image_processing_pil_efficientloftr.py @@ -21,9 +21,10 @@ is_valid_image, to_numpy_array, ) -from ...processing_utils import ImagesKwargs, Unpack +from ...processing_utils import Unpack from ...utils import TensorType, auto_docstring, is_torch_available from ...utils.import_utils import requires +from .image_processing_efficientloftr import EfficientLoFTRImageProcessorKwargs if TYPE_CHECKING: @@ -32,15 +33,6 @@ import torch -class EfficientLoFTRImageProcessorKwargs(ImagesKwargs, total=False): - r""" - do_grayscale (`bool`, *optional*, defaults to `self.do_grayscale`): - Whether to convert the image to grayscale. Can be overridden by `do_grayscale` in the `preprocess` method. - """ - - do_grayscale: bool - - def is_grayscale(image: np.ndarray): if image.shape[0] == 1: return True diff --git a/src/transformers/models/ernie4_5_vl_moe/image_processing_pil_ernie4_5_vl_moe.py b/src/transformers/models/ernie4_5_vl_moe/image_processing_pil_ernie4_5_vl_moe.py index 7f372c3af02d..8aed9c816627 100644 --- a/src/transformers/models/ernie4_5_vl_moe/image_processing_pil_ernie4_5_vl_moe.py +++ b/src/transformers/models/ernie4_5_vl_moe/image_processing_pil_ernie4_5_vl_moe.py @@ -25,28 +25,14 @@ from ...image_processing_backends import PilBackend from ...image_processing_utils import BatchFeature from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, ImageInput, PILImageResampling, SizeDict -from ...processing_utils import ImagesKwargs, Unpack +from ...processing_utils import Unpack from ...utils import TensorType, auto_docstring, logging +from .image_processing_ernie4_5_vl_moe import Ernie4_5_VLMoeImageProcessorKwargs logger = logging.get_logger(__name__) -class Ernie4_5_VLMoeImageProcessorKwargs(ImagesKwargs, total=False): - r""" - patch_size (`int`, *optional*, defaults to 14): - The spatial patch size of the vision encoder. - temporal_patch_size (`int`, *optional*): - The temporal patch size of the vision encoder. Unused in the image processor, only used for videos. - merge_size (`int`, *optional*, defaults to 2): - The merge size of the vision encoder to llm encoder. 
- """ - - patch_size: int - temporal_patch_size: int - merge_size: int - - def smart_resize( height: int, width: int, factor: int = 28, min_pixels: int = 56 * 56, max_pixels: int = 14 * 14 * 4 * 1280 ): diff --git a/src/transformers/models/glm46v/image_processing_pil_glm46v.py b/src/transformers/models/glm46v/image_processing_pil_glm46v.py index 5601e732c2b3..934988f738c8 100644 --- a/src/transformers/models/glm46v/image_processing_pil_glm46v.py +++ b/src/transformers/models/glm46v/image_processing_pil_glm46v.py @@ -26,23 +26,9 @@ from ...image_processing_backends import PilBackend from ...image_processing_utils import BatchFeature from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, ImageInput, PILImageResampling, SizeDict -from ...processing_utils import ImagesKwargs, Unpack +from ...processing_utils import Unpack from ...utils import TensorType, auto_docstring - - -class Glm46VImageProcessorKwargs(ImagesKwargs, total=False): - """ - patch_size (`int`, *optional*, defaults to 14): - The spatial patch size of the vision encoder. - temporal_patch_size (`int`, *optional*, defaults to 2): - The temporal patch size of the vision encoder. - merge_size (`int`, *optional*, defaults to 2): - The merge size of the vision encoder to llm encoder. - """ - - patch_size: int - temporal_patch_size: int - merge_size: int +from .image_processing_glm46v import Glm46VImageProcessorKwargs # Adapted from transformers.models.glm46v.image_processing_glm46v.smart_resize diff --git a/src/transformers/models/glm_image/image_processing_pil_glm_image.py b/src/transformers/models/glm_image/image_processing_pil_glm_image.py index 2dde18ef2066..0aaf95a9aaea 100644 --- a/src/transformers/models/glm_image/image_processing_pil_glm_image.py +++ b/src/transformers/models/glm_image/image_processing_pil_glm_image.py @@ -26,30 +26,9 @@ from ...feature_extraction_utils import BatchFeature from ...image_processing_backends import PilBackend from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, ImageInput, PILImageResampling, SizeDict -from ...processing_utils import ImagesKwargs, Unpack +from ...processing_utils import Unpack from ...utils import TensorType, auto_docstring - - -# Adapted from transformers.models.glm_image.image_processing_glm_image.GlmImageImageProcessorKwargs -class GlmImageImageProcessorKwargs(ImagesKwargs, total=False): - r""" - min_pixels (`int`, *optional*, defaults to `56 * 56`): - The min pixels of the image to resize the image. - max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`): - The max pixels of the image to resize the image. - patch_size (`int`, *optional*, defaults to 14): - The spatial patch size of the vision encoder. - temporal_patch_size (`int`, *optional*, defaults to 2): - The temporal patch size of the vision encoder. - merge_size (`int`, *optional*, defaults to 2): - The merge size of the vision encoder to llm encoder. 
- """ - - min_pixels: int - max_pixels: int - patch_size: int - temporal_patch_size: int - merge_size: int +from .image_processing_glm_image import GlmImageImageProcessorKwargs def smart_resize( diff --git a/src/transformers/models/grounding_dino/image_processing_pil_grounding_dino.py b/src/transformers/models/grounding_dino/image_processing_pil_grounding_dino.py index 31c59e5f3930..fbdbef4110b4 100644 --- a/src/transformers/models/grounding_dino/image_processing_pil_grounding_dino.py +++ b/src/transformers/models/grounding_dino/image_processing_pil_grounding_dino.py @@ -53,9 +53,10 @@ get_max_height_width, validate_annotations, ) -from ...processing_utils import ImagesKwargs, Unpack +from ...processing_utils import Unpack from ...utils import TensorType, auto_docstring, is_torch_available, is_vision_available, requires_backends from ...utils.import_utils import requires +from .image_processing_grounding_dino import GroundingDinoImageProcessorKwargs if TYPE_CHECKING: @@ -67,21 +68,6 @@ if is_torch_available(): import torch - -class GroundingDinoImageProcessorKwargs(ImagesKwargs, total=False): - r""" - format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): - Data format of the annotations. One of "coco_detection" or "coco_panoptic". - do_convert_annotations (`bool`, *optional*, defaults to `True`): - Controls whether to convert the annotations to the format expected by the GROUNDING_DINO model. Converts the - bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`. - Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method. - """ - - format: str | AnnotationFormat - do_convert_annotations: bool - - SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC) diff --git a/src/transformers/models/lightglue/image_processing_pil_lightglue.py b/src/transformers/models/lightglue/image_processing_pil_lightglue.py index 9f43fe1bbc7a..6283a2e1a2c5 100644 --- a/src/transformers/models/lightglue/image_processing_pil_lightglue.py +++ b/src/transformers/models/lightglue/image_processing_pil_lightglue.py @@ -35,9 +35,10 @@ is_valid_image, to_numpy_array, ) -from ...processing_utils import ImagesKwargs, Unpack +from ...processing_utils import Unpack from ...utils import TensorType, auto_docstring, is_torch_available from ...utils.import_utils import requires +from .image_processing_lightglue import LightGlueImageProcessorKwargs if TYPE_CHECKING: @@ -46,15 +47,6 @@ import torch -class LightGlueImageProcessorKwargs(ImagesKwargs, total=False): - r""" - do_grayscale (`bool`, *optional*, defaults to `self.do_grayscale`): - Whether to convert the image to grayscale. Can be overridden by `do_grayscale` in the `preprocess` method. 
- """ - - do_grayscale: bool - - def is_grayscale(image: np.ndarray): if image.shape[0] == 1: return True diff --git a/src/transformers/models/llava_onevision/image_processing_pil_llava_onevision.py b/src/transformers/models/llava_onevision/image_processing_pil_llava_onevision.py index 23534a65d70f..b894b72025b9 100644 --- a/src/transformers/models/llava_onevision/image_processing_pil_llava_onevision.py +++ b/src/transformers/models/llava_onevision/image_processing_pil_llava_onevision.py @@ -32,19 +32,9 @@ PILImageResampling, SizeDict, ) -from ...processing_utils import ImagesKwargs, Unpack +from ...processing_utils import Unpack from ...utils import TensorType, auto_docstring - - -class LlavaOnevisionImageProcessorKwargs(ImagesKwargs, total=False): - r""" - image_grid_pinpoints (`list[list[int]]`, *optional*): - A list of possible resolutions to use for processing high resolution images. The best resolution is selected - based on the original size of the image. Can be overridden by `image_grid_pinpoints` in the `preprocess` - method. - """ - - image_grid_pinpoints: list[list[int]] +from .image_processing_llava_onevision import LlavaOnevisionImageProcessorKwargs @auto_docstring diff --git a/src/transformers/models/mask2former/image_processing_pil_mask2former.py b/src/transformers/models/mask2former/image_processing_pil_mask2former.py index 8358a3601bed..2f13d1084ffa 100644 --- a/src/transformers/models/mask2former/image_processing_pil_mask2former.py +++ b/src/transformers/models/mask2former/image_processing_pil_mask2former.py @@ -39,9 +39,10 @@ get_image_size_for_max_height_width, get_max_height_width, ) -from ...processing_utils import ImagesKwargs, Unpack +from ...processing_utils import Unpack from ...utils import TensorType, auto_docstring, is_torch_available, logging, requires_backends from ...utils.import_utils import requires +from .image_processing_mask2former import Mask2FormerImageProcessorKwargs if is_torch_available(): @@ -51,32 +52,6 @@ logger = logging.get_logger(__name__) -class Mask2FormerImageProcessorKwargs(ImagesKwargs, total=False): - r""" - ignore_index (`int`, *optional*): - Label to be assigned to background pixels in segmentation maps. If provided, segmentation map pixels - denoted with 0 (background) will be replaced with `ignore_index`. - do_reduce_labels (`bool`, *optional*, defaults to `False`): - Whether or not to decrement all label values of segmentation maps by 1. Usually used for datasets where 0 - is used for background, and background itself is not included in all classes of a dataset (e.g. ADE20k). - The background label will be replaced by `ignore_index`. - num_labels (`int`, *optional*): - The number of labels in the segmentation map. - size_divisor (`int`, *optional*, defaults to `32`): - Some backbones need images divisible by a certain number. If not passed, it defaults to the value used in - Swin Transformer. - pad_size (`SizeDict`, *optional*): - The size to pad the images to. Must be larger than any image size provided for preprocessing. If `pad_size` - is not provided, images will be padded to the largest height and width in the batch. 
- """ - - ignore_index: int | None - do_reduce_labels: bool - num_labels: int | None - size_divisor: int - pad_size: SizeDict | None - - def convert_segmentation_map_to_binary_masks( segmentation_map: np.ndarray, instance_id_to_semantic_id: dict[int, int] | None = None, diff --git a/src/transformers/models/paddleocr_vl/image_processing_pil_paddleocr_vl.py b/src/transformers/models/paddleocr_vl/image_processing_pil_paddleocr_vl.py index ac639892640f..560b7869ddb9 100644 --- a/src/transformers/models/paddleocr_vl/image_processing_pil_paddleocr_vl.py +++ b/src/transformers/models/paddleocr_vl/image_processing_pil_paddleocr_vl.py @@ -31,25 +31,9 @@ from ...image_processing_backends import PilBackend from ...image_processing_utils import BatchFeature from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, ImageInput, PILImageResampling, SizeDict -from ...processing_utils import ImagesKwargs, Unpack +from ...processing_utils import Unpack from ...utils import TensorType, auto_docstring - - -class PaddleOCRVLImageProcessorKwargs(ImagesKwargs, total=False): - r""" - patch_size (`int`, *optional*, defaults to 14): - The spatial patch size of the vision encoder. - temporal_patch_size (`int`, *optional*, defaults to 1): - The temporal patch size of the vision encoder. - merge_size (`int`, *optional*, defaults to 2): - The merge size of the vision encoder to llm encoder. - """ - - min_pixels: int - max_pixels: int - patch_size: int - temporal_patch_size: int - merge_size: int +from .image_processing_paddleocr_vl import PaddleOCRVLImageProcessorKwargs def smart_resize( diff --git a/src/transformers/models/rt_detr/image_processing_pil_rt_detr.py b/src/transformers/models/rt_detr/image_processing_pil_rt_detr.py index 1fe55d067653..606b5640602c 100644 --- a/src/transformers/models/rt_detr/image_processing_pil_rt_detr.py +++ b/src/transformers/models/rt_detr/image_processing_pil_rt_detr.py @@ -46,29 +46,15 @@ get_max_height_width, validate_annotations, ) -from ...processing_utils import ImagesKwargs, Unpack +from ...processing_utils import Unpack from ...utils import TensorType, auto_docstring, is_torch_available, requires_backends from ...utils.import_utils import requires +from .image_processing_rt_detr import RTDetrImageProcessorKwargs if is_torch_available(): import torch - -class RTDetrImageProcessorKwargs(ImagesKwargs, total=False): - r""" - format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): - Data format of the annotations. One of "coco_detection" or "coco_panoptic". - do_convert_annotations (`bool`, *optional*, defaults to `True`): - Controls whether to convert the annotations to the format expected by the RT_DETR model. Converts the - bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`. - Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method. 
- """ - - format: str | AnnotationFormat - do_convert_annotations: bool - - SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC) diff --git a/src/transformers/models/segformer/image_processing_pil_segformer.py b/src/transformers/models/segformer/image_processing_pil_segformer.py index f1d0bb0f627b..77514873c59a 100644 --- a/src/transformers/models/segformer/image_processing_pil_segformer.py +++ b/src/transformers/models/segformer/image_processing_pil_segformer.py @@ -31,9 +31,10 @@ PILImageResampling, SizeDict, ) -from ...processing_utils import ImagesKwargs, Unpack +from ...processing_utils import Unpack from ...utils import TensorType, auto_docstring, is_torch_available, is_torchvision_available from ...utils.import_utils import requires +from .image_processing_segformer import SegformerImageProcessorKwargs if is_torch_available(): @@ -42,17 +43,6 @@ import torchvision.transforms.v2.functional as tvF -class SegformerImageProcessorKwargs(ImagesKwargs, total=False): - r""" - do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`): - Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0 - is used for background, and background itself is not included in all classes of a dataset (e.g. - ADE20k). The background label will be replaced by 255. - """ - - do_reduce_labels: bool - - @requires(backends=("torch", "torchvision")) class SegformerImageProcessorPil(PilBackend): """PIL backend for Segformer with reduce_label support.""" diff --git a/src/transformers/models/smolvlm/image_processing_pil_smolvlm.py b/src/transformers/models/smolvlm/image_processing_pil_smolvlm.py index 3d53ed09c11f..29f3a89f3418 100644 --- a/src/transformers/models/smolvlm/image_processing_pil_smolvlm.py +++ b/src/transformers/models/smolvlm/image_processing_pil_smolvlm.py @@ -35,24 +35,9 @@ SizeDict, make_nested_list_of_images, ) -from ...processing_utils import ImagesKwargs, Unpack +from ...processing_utils import Unpack from ...utils import TensorType, auto_docstring - - -class SmolVLMImageProcessorKwargs(ImagesKwargs, total=False): - """ - do_image_splitting (`bool`, *optional*, defaults to `True`): - Whether to split the image into sub-images concatenated with the original image. They are split into patches - such that each patch has a size of `max_image_size["height"]` x `max_image_size["width"]`. - max_image_size (`Dict`, *optional*, defaults to `{"longest_edge": 364}`): - Maximum resolution of the patches of images accepted by the model. This is a dictionary containing the key "longest_edge". - return_row_col_info (`bool`, *optional*, defaults to `False`): - Whether to return the row and column information of the images. 
- """ - - do_image_splitting: bool - max_image_size: dict[str, int] - return_row_col_info: bool +from .image_processing_smolvlm import SmolVLMImageProcessorKwargs def _make_pixel_mask(image: np.ndarray, output_size: tuple[int, int]) -> np.ndarray: diff --git a/src/transformers/models/video_llama_3/image_processing_pil_video_llama_3.py b/src/transformers/models/video_llama_3/image_processing_pil_video_llama_3.py index 5272c7465b2b..46f1cbb7d25d 100644 --- a/src/transformers/models/video_llama_3/image_processing_pil_video_llama_3.py +++ b/src/transformers/models/video_llama_3/image_processing_pil_video_llama_3.py @@ -26,29 +26,9 @@ from ...feature_extraction_utils import BatchFeature from ...image_processing_backends import PilBackend from ...image_utils import IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD, ImageInput, PILImageResampling, SizeDict -from ...processing_utils import ImagesKwargs, Unpack +from ...processing_utils import Unpack from ...utils import TensorType, auto_docstring - - -class VideoLlama3ImageProcessorKwargs(ImagesKwargs, total=False): - r""" - min_pixels (`int`, *optional*, defaults to `56 * 56`): - The min pixels of the image to resize the image. - max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`): - The max pixels of the image to resize the image. - patch_size (`int`, *optional*, defaults to 14): - The spatial patch size of the vision encoder. - temporal_patch_size (`int`, *optional*, defaults to 2): - The temporal patch size of the vision encoder. - merge_size (`int`, *optional*, defaults to 2): - The merge size of the vision encoder to llm encoder. - """ - - min_pixels: int - max_pixels: int - patch_size: int - temporal_patch_size: int - merge_size: int +from .image_processing_video_llama_3 import VideoLlama3ImageProcessorKwargs def smart_resize( diff --git a/src/transformers/models/yolos/image_processing_pil_yolos.py b/src/transformers/models/yolos/image_processing_pil_yolos.py index 219348363ea3..7f5b8385d8b9 100644 --- a/src/transformers/models/yolos/image_processing_pil_yolos.py +++ b/src/transformers/models/yolos/image_processing_pil_yolos.py @@ -33,9 +33,10 @@ get_max_height_width, validate_annotations, ) -from ...processing_utils import ImagesKwargs, Unpack +from ...processing_utils import Unpack from ...utils import TensorType, auto_docstring, is_torch_available, is_vision_available, requires_backends from ...utils.import_utils import requires +from .image_processing_yolos import YolosImageProcessorKwargs if is_vision_available(): @@ -44,22 +45,6 @@ import torch from torch import nn - -# Adapted from transformers.models.yolos.image_processing_yolos.YolosImageProcessorKwargs -class YolosImageProcessorKwargs(ImagesKwargs, total=False): - r""" - format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): - Data format of the annotations. One of "coco_detection" or "coco_panoptic". - do_convert_annotations (`bool`, *optional*, defaults to `True`): - Controls whether to convert the annotations to the format expected by the YOLOS model. Converts the - bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`. - Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method. 
- """ - - format: str | AnnotationFormat - do_convert_annotations: bool - - SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC) From fc6dc66c0e382888ef5df92870cf643dcce2ba45 Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Tue, 21 Apr 2026 18:07:55 +0900 Subject: [PATCH 6/9] fix modular to always copy --- .../image_processing_pil_conditional_detr.py | 17 ++++++++-- .../image_processing_pil_deepseek_vl.py | 13 +++++-- ...image_processing_pil_deepseek_vl_hybrid.py | 30 ++++++++++++++-- .../image_processing_pil_deformable_detr.py | 17 ++++++++-- .../image_processing_pil_efficientloftr.py | 12 +++++-- .../image_processing_pil_ernie4_5_vl_moe.py | 18 ++++++++-- .../glm46v/image_processing_pil_glm46v.py | 18 ++++++++-- .../image_processing_pil_glm_image.py | 24 +++++++++++-- .../image_processing_pil_grounding_dino.py | 17 ++++++++-- .../image_processing_pil_lightglue.py | 12 +++++-- .../image_processing_pil_llava_onevision.py | 14 ++++++-- .../image_processing_pil_mask2former.py | 29 ++++++++++++++-- .../image_processing_pil_paddleocr_vl.py | 20 +++++++++-- .../rt_detr/image_processing_pil_rt_detr.py | 17 ++++++++-- .../image_processing_pil_segformer.py | 14 ++++++-- .../smolvlm/image_processing_pil_smolvlm.py | 19 +++++++++-- .../image_processing_pil_video_llama_3.py | 24 +++++++++++-- .../yolos/image_processing_pil_yolos.py | 17 ++++++++-- utils/modular_model_converter.py | 34 +++++++++++++++++++ 19 files changed, 330 insertions(+), 36 deletions(-) diff --git a/src/transformers/models/conditional_detr/image_processing_pil_conditional_detr.py b/src/transformers/models/conditional_detr/image_processing_pil_conditional_detr.py index 30b6e2752273..30740114d5f0 100644 --- a/src/transformers/models/conditional_detr/image_processing_pil_conditional_detr.py +++ b/src/transformers/models/conditional_detr/image_processing_pil_conditional_detr.py @@ -48,10 +48,9 @@ get_max_height_width, validate_annotations, ) -from ...processing_utils import Unpack +from ...processing_utils import ImagesKwargs, Unpack from ...utils import TensorType, auto_docstring, is_torch_available, is_vision_available, logging, requires_backends from ...utils.import_utils import requires -from .image_processing_conditional_detr import ConditionalDetrImageProcessorKwargs if is_vision_available(): @@ -65,6 +64,20 @@ SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC) +class ConditionalDetrImageProcessorKwargs(ImagesKwargs, total=False): + r""" + format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): + Data format of the annotations. One of "coco_detection" or "coco_panoptic". + do_convert_annotations (`bool`, *optional*, defaults to `True`): + Controls whether to convert the annotations to the format expected by the CONDITIONAL_DETR model. Converts the + bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`. + Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method. 
+ """ + + format: str | AnnotationFormat + do_convert_annotations: bool + + # inspired by https://github.com/facebookresearch/conditional_detr/blob/master/datasets/coco.py#L33 def convert_coco_poly_to_mask(segmentations, height: int, width: int) -> np.ndarray: """ diff --git a/src/transformers/models/deepseek_vl/image_processing_pil_deepseek_vl.py b/src/transformers/models/deepseek_vl/image_processing_pil_deepseek_vl.py index d29296535277..6e2a220e3fd2 100644 --- a/src/transformers/models/deepseek_vl/image_processing_pil_deepseek_vl.py +++ b/src/transformers/models/deepseek_vl/image_processing_pil_deepseek_vl.py @@ -32,9 +32,8 @@ PILImageResampling, SizeDict, ) -from ...processing_utils import Unpack +from ...processing_utils import ImagesKwargs, Unpack from ...utils import TensorType, auto_docstring -from .image_processing_deepseek_vl import DeepseekVLImageProcessorKwargs @auto_docstring @@ -162,4 +161,14 @@ def postprocess(self): raise AttributeError("Not needed for DeepseekVL") +class DeepseekVLImageProcessorKwargs(ImagesKwargs, total=False): + r""" + min_size (`int`, *optional*, defaults to 14): + The minimum allowed size for the resized image. Ensures that neither the height nor width + falls below this value after resizing. + """ + + min_size: int + + __all__ = ["DeepseekVLImageProcessorPil"] diff --git a/src/transformers/models/deepseek_vl_hybrid/image_processing_pil_deepseek_vl_hybrid.py b/src/transformers/models/deepseek_vl_hybrid/image_processing_pil_deepseek_vl_hybrid.py index c7ef92dce05f..b1ea56d48a46 100644 --- a/src/transformers/models/deepseek_vl_hybrid/image_processing_pil_deepseek_vl_hybrid.py +++ b/src/transformers/models/deepseek_vl_hybrid/image_processing_pil_deepseek_vl_hybrid.py @@ -19,6 +19,7 @@ # limitations under the License. from collections.abc import Iterable +from typing import Union import numpy as np @@ -33,9 +34,8 @@ PILImageResampling, SizeDict, ) -from ...processing_utils import Unpack +from ...processing_utils import ImagesKwargs, Unpack from ...utils import TensorType, auto_docstring -from .image_processing_deepseek_vl_hybrid import DeepseekVLHybridImageProcessorKwargs @auto_docstring @@ -232,4 +232,30 @@ def _standardize_kwargs( return kwargs +class DeepseekVLHybridImageProcessorKwargs(ImagesKwargs, total=False): + r""" + min_size (`int`, *optional*, defaults to 14): + The minimum allowed size for the resized image. Ensures that neither the height nor width + falls below this value after resizing. + high_res_size (`dict`, *optional*, defaults to `{"height": 1024, "width": 1024}`): + Size of the high resolution output image after resizing. Can be overridden by the `high_res_size` parameter in the `preprocess` + method. + high_res_resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`): + Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be + overridden by the `high_res_resample` parameter in the `preprocess` method. + high_res_image_mean (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_MEAN`): + Mean to use if normalizing the high resolution image. This is a float or list of floats the length of the number of + channels in the image. Can be overridden by the `high_res_image_mean` parameter in the `preprocess` method. + high_res_image_std (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_STD`): + Standard deviation to use if normalizing the high resolution image. This is a float or list of floats the length of the + number of channels in the image. 
Can be overridden by the `high_res_image_std` parameter in the `preprocess` method. + """ + + min_size: int + high_res_size: dict + high_res_resample: Union["PILImageResampling", int] + high_res_image_mean: float | list[float] | tuple[float, ...] + high_res_image_std: float | list[float] | tuple[float, ...] + + __all__ = ["DeepseekVLHybridImageProcessorPil"] diff --git a/src/transformers/models/deformable_detr/image_processing_pil_deformable_detr.py b/src/transformers/models/deformable_detr/image_processing_pil_deformable_detr.py index dd66876deca4..9c7ccc213910 100644 --- a/src/transformers/models/deformable_detr/image_processing_pil_deformable_detr.py +++ b/src/transformers/models/deformable_detr/image_processing_pil_deformable_detr.py @@ -47,10 +47,9 @@ get_max_height_width, validate_annotations, ) -from ...processing_utils import Unpack +from ...processing_utils import ImagesKwargs, Unpack from ...utils import TensorType, auto_docstring, is_torch_available, is_vision_available from ...utils.import_utils import requires, requires_backends -from .image_processing_deformable_detr import DeformableDetrImageProcessorKwargs if is_vision_available(): @@ -61,6 +60,20 @@ SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC) +class DeformableDetrImageProcessorKwargs(ImagesKwargs, total=False): + r""" + format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): + Data format of the annotations. One of "coco_detection" or "coco_panoptic". + do_convert_annotations (`bool`, *optional*, defaults to `True`): + Controls whether to convert the annotations to the format expected by the DEFORMABLE_DETR model. Converts the + bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`. + Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method. + """ + + format: str | AnnotationFormat + do_convert_annotations: bool + + # inspired by https://github.com/facebookresearch/deformable_detr/blob/master/datasets/coco.py#L33 def convert_coco_poly_to_mask(segmentations, height: int, width: int) -> np.ndarray: """ diff --git a/src/transformers/models/efficientloftr/image_processing_pil_efficientloftr.py b/src/transformers/models/efficientloftr/image_processing_pil_efficientloftr.py index 66f7314143f3..7c42d75f2baa 100644 --- a/src/transformers/models/efficientloftr/image_processing_pil_efficientloftr.py +++ b/src/transformers/models/efficientloftr/image_processing_pil_efficientloftr.py @@ -21,10 +21,9 @@ is_valid_image, to_numpy_array, ) -from ...processing_utils import Unpack +from ...processing_utils import ImagesKwargs, Unpack from ...utils import TensorType, auto_docstring, is_torch_available from ...utils.import_utils import requires -from .image_processing_efficientloftr import EfficientLoFTRImageProcessorKwargs if TYPE_CHECKING: @@ -39,6 +38,15 @@ def is_grayscale(image: np.ndarray): return np.all(image[0, ...] == image[1, ...]) and np.all(image[1, ...] == image[2, ...]) +class EfficientLoFTRImageProcessorKwargs(ImagesKwargs, total=False): + r""" + do_grayscale (`bool`, *optional*, defaults to `self.do_grayscale`): + Whether to convert the image to grayscale. Can be overridden by `do_grayscale` in the `preprocess` method. + """ + + do_grayscale: bool + + def convert_to_grayscale(image: ImageInput) -> ImageInput: """ Converts an image to grayscale format using the NTSC formula. Only support numpy and PIL Image. 
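The kwargs classes inlined in the hunks above all follow the same pattern: a `TypedDict`-style subclass of `ImagesKwargs` declared with `total=False`, so every field is optional, which the processors then accept through an `Unpack[...]` annotation on `**kwargs`. Below is a minimal, self-contained sketch of that pattern; it uses only the standard `typing` module and illustrative names, not the actual transformers classes.

from typing import TypedDict, Unpack  # Unpack requires Python 3.11+ (or typing_extensions on older versions)


class ToyImageKwargs(TypedDict, total=False):
    # total=False makes every key optional, mirroring the *ImageProcessorKwargs classes above
    patch_size: int
    merge_size: int


def preprocess(image, **kwargs: Unpack[ToyImageKwargs]) -> dict:
    # a type checker now knows exactly which keyword arguments are accepted and their types
    return {
        "patch_size": kwargs.get("patch_size", 14),
        "merge_size": kwargs.get("merge_size", 2),
    }


print(preprocess(None, patch_size=16))  # {'patch_size': 16, 'merge_size': 2}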
diff --git a/src/transformers/models/ernie4_5_vl_moe/image_processing_pil_ernie4_5_vl_moe.py b/src/transformers/models/ernie4_5_vl_moe/image_processing_pil_ernie4_5_vl_moe.py index 8aed9c816627..4b6db850f8da 100644 --- a/src/transformers/models/ernie4_5_vl_moe/image_processing_pil_ernie4_5_vl_moe.py +++ b/src/transformers/models/ernie4_5_vl_moe/image_processing_pil_ernie4_5_vl_moe.py @@ -25,9 +25,8 @@ from ...image_processing_backends import PilBackend from ...image_processing_utils import BatchFeature from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, ImageInput, PILImageResampling, SizeDict -from ...processing_utils import Unpack +from ...processing_utils import ImagesKwargs, Unpack from ...utils import TensorType, auto_docstring, logging -from .image_processing_ernie4_5_vl_moe import Ernie4_5_VLMoeImageProcessorKwargs logger = logging.get_logger(__name__) @@ -62,6 +61,21 @@ def smart_resize( return h_bar, w_bar +class Ernie4_5_VLMoeImageProcessorKwargs(ImagesKwargs, total=False): + r""" + patch_size (`int`, *optional*, defaults to 14): + The spatial patch size of the vision encoder. + temporal_patch_size (`int`, *optional*): + The temporal patch size of the vision encoder. Unused in the image processor, only used for videos. + merge_size (`int`, *optional*, defaults to 2): + The merge size of the vision encoder to llm encoder. + """ + + patch_size: int + temporal_patch_size: int + merge_size: int + + @auto_docstring class Ernie4_5_VLMoeImageProcessorPil(PilBackend): do_resize = True diff --git a/src/transformers/models/glm46v/image_processing_pil_glm46v.py b/src/transformers/models/glm46v/image_processing_pil_glm46v.py index 934988f738c8..5070535f6ecf 100644 --- a/src/transformers/models/glm46v/image_processing_pil_glm46v.py +++ b/src/transformers/models/glm46v/image_processing_pil_glm46v.py @@ -26,9 +26,8 @@ from ...image_processing_backends import PilBackend from ...image_processing_utils import BatchFeature from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, ImageInput, PILImageResampling, SizeDict -from ...processing_utils import Unpack +from ...processing_utils import ImagesKwargs, Unpack from ...utils import TensorType, auto_docstring -from .image_processing_glm46v import Glm46VImageProcessorKwargs # Adapted from transformers.models.glm46v.image_processing_glm46v.smart_resize @@ -68,6 +67,21 @@ def smart_resize( return h_bar, w_bar +class Glm46VImageProcessorKwargs(ImagesKwargs, total=False): + """ + patch_size (`int`, *optional*, defaults to 14): + The spatial patch size of the vision encoder. + temporal_patch_size (`int`, *optional*, defaults to 2): + The temporal patch size of the vision encoder. + merge_size (`int`, *optional*, defaults to 2): + The merge size of the vision encoder to llm encoder. 
+ """ + + patch_size: int + temporal_patch_size: int + merge_size: int + + @auto_docstring class Glm46VImageProcessorPil(PilBackend): do_resize = True diff --git a/src/transformers/models/glm_image/image_processing_pil_glm_image.py b/src/transformers/models/glm_image/image_processing_pil_glm_image.py index 0aaf95a9aaea..7861cc32a1ae 100644 --- a/src/transformers/models/glm_image/image_processing_pil_glm_image.py +++ b/src/transformers/models/glm_image/image_processing_pil_glm_image.py @@ -26,9 +26,8 @@ from ...feature_extraction_utils import BatchFeature from ...image_processing_backends import PilBackend from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, ImageInput, PILImageResampling, SizeDict -from ...processing_utils import Unpack +from ...processing_utils import ImagesKwargs, Unpack from ...utils import TensorType, auto_docstring -from .image_processing_glm_image import GlmImageImageProcessorKwargs def smart_resize( @@ -72,6 +71,27 @@ def smart_resize( return h_bar, w_bar +class GlmImageImageProcessorKwargs(ImagesKwargs, total=False): + r""" + min_pixels (`int`, *optional*, defaults to `56 * 56`): + The min pixels of the image to resize the image. + max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`): + The max pixels of the image to resize the image. + patch_size (`int`, *optional*, defaults to 14): + The spatial patch size of the vision encoder. + temporal_patch_size (`int`, *optional*, defaults to 2): + The temporal patch size of the vision encoder. + merge_size (`int`, *optional*, defaults to 2): + The merge size of the vision encoder to llm encoder. + """ + + min_pixels: int + max_pixels: int + patch_size: int + temporal_patch_size: int + merge_size: int + + @auto_docstring class GlmImageImageProcessorPil(PilBackend): do_resize = True diff --git a/src/transformers/models/grounding_dino/image_processing_pil_grounding_dino.py b/src/transformers/models/grounding_dino/image_processing_pil_grounding_dino.py index fbdbef4110b4..c95d7cb386bd 100644 --- a/src/transformers/models/grounding_dino/image_processing_pil_grounding_dino.py +++ b/src/transformers/models/grounding_dino/image_processing_pil_grounding_dino.py @@ -53,10 +53,9 @@ get_max_height_width, validate_annotations, ) -from ...processing_utils import Unpack +from ...processing_utils import ImagesKwargs, Unpack from ...utils import TensorType, auto_docstring, is_torch_available, is_vision_available, requires_backends from ...utils.import_utils import requires -from .image_processing_grounding_dino import GroundingDinoImageProcessorKwargs if TYPE_CHECKING: @@ -71,6 +70,20 @@ SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC) +class GroundingDinoImageProcessorKwargs(ImagesKwargs, total=False): + r""" + format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): + Data format of the annotations. One of "coco_detection" or "coco_panoptic". + do_convert_annotations (`bool`, *optional*, defaults to `True`): + Controls whether to convert the annotations to the format expected by the GROUNDING_DINO model. Converts the + bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`. + Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method. 
+ """ + + format: str | AnnotationFormat + do_convert_annotations: bool + + # inspired by https://github.com/facebookresearch/grounding_dino/blob/master/datasets/coco.py#L33 def convert_coco_poly_to_mask(segmentations, height: int, width: int) -> np.ndarray: """ diff --git a/src/transformers/models/lightglue/image_processing_pil_lightglue.py b/src/transformers/models/lightglue/image_processing_pil_lightglue.py index 6283a2e1a2c5..77389f8e8da3 100644 --- a/src/transformers/models/lightglue/image_processing_pil_lightglue.py +++ b/src/transformers/models/lightglue/image_processing_pil_lightglue.py @@ -35,10 +35,9 @@ is_valid_image, to_numpy_array, ) -from ...processing_utils import Unpack +from ...processing_utils import ImagesKwargs, Unpack from ...utils import TensorType, auto_docstring, is_torch_available from ...utils.import_utils import requires -from .image_processing_lightglue import LightGlueImageProcessorKwargs if TYPE_CHECKING: @@ -53,6 +52,15 @@ def is_grayscale(image: np.ndarray): return np.all(image[0, ...] == image[1, ...]) and np.all(image[1, ...] == image[2, ...]) +class LightGlueImageProcessorKwargs(ImagesKwargs, total=False): + r""" + do_grayscale (`bool`, *optional*, defaults to `self.do_grayscale`): + Whether to convert the image to grayscale. Can be overridden by `do_grayscale` in the `preprocess` method. + """ + + do_grayscale: bool + + def convert_to_grayscale(image: ImageInput) -> ImageInput: """ Converts an image to grayscale format using the NTSC formula. Only support numpy and PIL Image. diff --git a/src/transformers/models/llava_onevision/image_processing_pil_llava_onevision.py b/src/transformers/models/llava_onevision/image_processing_pil_llava_onevision.py index b894b72025b9..96a973ead67d 100644 --- a/src/transformers/models/llava_onevision/image_processing_pil_llava_onevision.py +++ b/src/transformers/models/llava_onevision/image_processing_pil_llava_onevision.py @@ -32,9 +32,8 @@ PILImageResampling, SizeDict, ) -from ...processing_utils import Unpack +from ...processing_utils import ImagesKwargs, Unpack from ...utils import TensorType, auto_docstring -from .image_processing_llava_onevision import LlavaOnevisionImageProcessorKwargs @auto_docstring @@ -294,4 +293,15 @@ def pad_to_square( return result +class LlavaOnevisionImageProcessorKwargs(ImagesKwargs, total=False): + r""" + image_grid_pinpoints (`list[list[int]]`, *optional*): + A list of possible resolutions to use for processing high resolution images. The best resolution is selected + based on the original size of the image. Can be overridden by `image_grid_pinpoints` in the `preprocess` + method. 
+ """ + + image_grid_pinpoints: list[list[int]] + + __all__ = ["LlavaOnevisionImageProcessorPil"] diff --git a/src/transformers/models/mask2former/image_processing_pil_mask2former.py b/src/transformers/models/mask2former/image_processing_pil_mask2former.py index 2f13d1084ffa..6b27657b3677 100644 --- a/src/transformers/models/mask2former/image_processing_pil_mask2former.py +++ b/src/transformers/models/mask2former/image_processing_pil_mask2former.py @@ -39,10 +39,9 @@ get_image_size_for_max_height_width, get_max_height_width, ) -from ...processing_utils import Unpack +from ...processing_utils import ImagesKwargs, Unpack from ...utils import TensorType, auto_docstring, is_torch_available, logging, requires_backends from ...utils.import_utils import requires -from .image_processing_mask2former import Mask2FormerImageProcessorKwargs if is_torch_available(): @@ -88,6 +87,32 @@ def convert_segmentation_map_to_binary_masks( return binary_masks.astype(np.float32), labels +class Mask2FormerImageProcessorKwargs(ImagesKwargs, total=False): + r""" + ignore_index (`int`, *optional*): + Label to be assigned to background pixels in segmentation maps. If provided, segmentation map pixels + denoted with 0 (background) will be replaced with `ignore_index`. + do_reduce_labels (`bool`, *optional*, defaults to `False`): + Whether or not to decrement all label values of segmentation maps by 1. Usually used for datasets where 0 + is used for background, and background itself is not included in all classes of a dataset (e.g. ADE20k). + The background label will be replaced by `ignore_index`. + num_labels (`int`, *optional*): + The number of labels in the segmentation map. + size_divisor (`int`, *optional*, defaults to `32`): + Some backbones need images divisible by a certain number. If not passed, it defaults to the value used in + Swin Transformer. + pad_size (`SizeDict`, *optional*): + The size to pad the images to. Must be larger than any image size provided for preprocessing. If `pad_size` + is not provided, images will be padded to the largest height and width in the batch. + """ + + ignore_index: int | None + do_reduce_labels: bool + num_labels: int | None + size_divisor: int + pad_size: SizeDict | None + + # Adapted from transformers.models.mask2former.image_processing_mask2former.binary_mask_to_rle def binary_mask_to_rle(mask): """ diff --git a/src/transformers/models/paddleocr_vl/image_processing_pil_paddleocr_vl.py b/src/transformers/models/paddleocr_vl/image_processing_pil_paddleocr_vl.py index 560b7869ddb9..c524acc0debc 100644 --- a/src/transformers/models/paddleocr_vl/image_processing_pil_paddleocr_vl.py +++ b/src/transformers/models/paddleocr_vl/image_processing_pil_paddleocr_vl.py @@ -31,9 +31,8 @@ from ...image_processing_backends import PilBackend from ...image_processing_utils import BatchFeature from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, ImageInput, PILImageResampling, SizeDict -from ...processing_utils import Unpack +from ...processing_utils import ImagesKwargs, Unpack from ...utils import TensorType, auto_docstring -from .image_processing_paddleocr_vl import PaddleOCRVLImageProcessorKwargs def smart_resize( @@ -68,6 +67,23 @@ def smart_resize( return h_bar, w_bar +class PaddleOCRVLImageProcessorKwargs(ImagesKwargs, total=False): + r""" + patch_size (`int`, *optional*, defaults to 14): + The spatial patch size of the vision encoder. + temporal_patch_size (`int`, *optional*, defaults to 1): + The temporal patch size of the vision encoder. 
+ merge_size (`int`, *optional*, defaults to 2): + The merge size of the vision encoder to llm encoder. + """ + + min_pixels: int + max_pixels: int + patch_size: int + temporal_patch_size: int + merge_size: int + + @auto_docstring class PaddleOCRVLImageProcessorPil(PilBackend): do_resize = True diff --git a/src/transformers/models/rt_detr/image_processing_pil_rt_detr.py b/src/transformers/models/rt_detr/image_processing_pil_rt_detr.py index 606b5640602c..669843e9f949 100644 --- a/src/transformers/models/rt_detr/image_processing_pil_rt_detr.py +++ b/src/transformers/models/rt_detr/image_processing_pil_rt_detr.py @@ -46,10 +46,9 @@ get_max_height_width, validate_annotations, ) -from ...processing_utils import Unpack +from ...processing_utils import ImagesKwargs, Unpack from ...utils import TensorType, auto_docstring, is_torch_available, requires_backends from ...utils.import_utils import requires -from .image_processing_rt_detr import RTDetrImageProcessorKwargs if is_torch_available(): @@ -58,6 +57,20 @@ SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC) +class RTDetrImageProcessorKwargs(ImagesKwargs, total=False): + r""" + format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): + Data format of the annotations. One of "coco_detection" or "coco_panoptic". + do_convert_annotations (`bool`, *optional*, defaults to `True`): + Controls whether to convert the annotations to the format expected by the RT_DETR model. Converts the + bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`. + Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method. + """ + + format: str | AnnotationFormat + do_convert_annotations: bool + + def prepare_coco_detection_annotation_pil( image, target, diff --git a/src/transformers/models/segformer/image_processing_pil_segformer.py b/src/transformers/models/segformer/image_processing_pil_segformer.py index 77514873c59a..7bffa8ab490f 100644 --- a/src/transformers/models/segformer/image_processing_pil_segformer.py +++ b/src/transformers/models/segformer/image_processing_pil_segformer.py @@ -31,10 +31,9 @@ PILImageResampling, SizeDict, ) -from ...processing_utils import Unpack +from ...processing_utils import ImagesKwargs, Unpack from ...utils import TensorType, auto_docstring, is_torch_available, is_torchvision_available from ...utils.import_utils import requires -from .image_processing_segformer import SegformerImageProcessorKwargs if is_torch_available(): @@ -210,4 +209,15 @@ def post_process_semantic_segmentation(self, outputs, target_sizes: list[tuple] return semantic_segmentation +class SegformerImageProcessorKwargs(ImagesKwargs, total=False): + r""" + do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`): + Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0 + is used for background, and background itself is not included in all classes of a dataset (e.g. + ADE20k). The background label will be replaced by 255. 
+ """ + + do_reduce_labels: bool + + __all__ = ["SegformerImageProcessorPil"] diff --git a/src/transformers/models/smolvlm/image_processing_pil_smolvlm.py b/src/transformers/models/smolvlm/image_processing_pil_smolvlm.py index 29f3a89f3418..dea8fad98b32 100644 --- a/src/transformers/models/smolvlm/image_processing_pil_smolvlm.py +++ b/src/transformers/models/smolvlm/image_processing_pil_smolvlm.py @@ -35,9 +35,8 @@ SizeDict, make_nested_list_of_images, ) -from ...processing_utils import Unpack +from ...processing_utils import ImagesKwargs, Unpack from ...utils import TensorType, auto_docstring -from .image_processing_smolvlm import SmolVLMImageProcessorKwargs def _make_pixel_mask(image: np.ndarray, output_size: tuple[int, int]) -> np.ndarray: @@ -48,6 +47,22 @@ def _make_pixel_mask(image: np.ndarray, output_size: tuple[int, int]) -> np.ndar return mask +class SmolVLMImageProcessorKwargs(ImagesKwargs, total=False): + """ + do_image_splitting (`bool`, *optional*, defaults to `True`): + Whether to split the image into sub-images concatenated with the original image. They are split into patches + such that each patch has a size of `max_image_size["height"]` x `max_image_size["width"]`. + max_image_size (`Dict`, *optional*, defaults to `{"longest_edge": 364}`): + Maximum resolution of the patches of images accepted by the model. This is a dictionary containing the key "longest_edge". + return_row_col_info (`bool`, *optional*, defaults to `False`): + Whether to return the row and column information of the images. + """ + + do_image_splitting: bool + max_image_size: dict[str, int] + return_row_col_info: bool + + # Adapted from transformers.models.smolvlm.image_processing_smolvlm.MAX_IMAGE_SIZE MAX_IMAGE_SIZE = 4096 # 4k resolution as absolute maximum diff --git a/src/transformers/models/video_llama_3/image_processing_pil_video_llama_3.py b/src/transformers/models/video_llama_3/image_processing_pil_video_llama_3.py index 46f1cbb7d25d..a48e79e09936 100644 --- a/src/transformers/models/video_llama_3/image_processing_pil_video_llama_3.py +++ b/src/transformers/models/video_llama_3/image_processing_pil_video_llama_3.py @@ -26,9 +26,8 @@ from ...feature_extraction_utils import BatchFeature from ...image_processing_backends import PilBackend from ...image_utils import IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD, ImageInput, PILImageResampling, SizeDict -from ...processing_utils import Unpack +from ...processing_utils import ImagesKwargs, Unpack from ...utils import TensorType, auto_docstring -from .image_processing_video_llama_3 import VideoLlama3ImageProcessorKwargs def smart_resize( @@ -60,6 +59,27 @@ def smart_resize( return h_bar, w_bar +class VideoLlama3ImageProcessorKwargs(ImagesKwargs, total=False): + r""" + min_pixels (`int`, *optional*, defaults to `56 * 56`): + The min pixels of the image to resize the image. + max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`): + The max pixels of the image to resize the image. + patch_size (`int`, *optional*, defaults to 14): + The spatial patch size of the vision encoder. + temporal_patch_size (`int`, *optional*, defaults to 2): + The temporal patch size of the vision encoder. + merge_size (`int`, *optional*, defaults to 2): + The merge size of the vision encoder to llm encoder. 
+ """ + + min_pixels: int + max_pixels: int + patch_size: int + temporal_patch_size: int + merge_size: int + + @auto_docstring class VideoLlama3ImageProcessorPil(PilBackend): do_resize = True diff --git a/src/transformers/models/yolos/image_processing_pil_yolos.py b/src/transformers/models/yolos/image_processing_pil_yolos.py index 7f5b8385d8b9..f42fb5a63701 100644 --- a/src/transformers/models/yolos/image_processing_pil_yolos.py +++ b/src/transformers/models/yolos/image_processing_pil_yolos.py @@ -33,10 +33,9 @@ get_max_height_width, validate_annotations, ) -from ...processing_utils import Unpack +from ...processing_utils import ImagesKwargs, Unpack from ...utils import TensorType, auto_docstring, is_torch_available, is_vision_available, requires_backends from ...utils.import_utils import requires -from .image_processing_yolos import YolosImageProcessorKwargs if is_vision_available(): @@ -48,6 +47,20 @@ SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC) +class YolosImageProcessorKwargs(ImagesKwargs, total=False): + r""" + format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): + Data format of the annotations. One of "coco_detection" or "coco_panoptic". + do_convert_annotations (`bool`, *optional*, defaults to `True`): + Controls whether to convert the annotations to the format expected by the YOLOS model. Converts the + bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`. + Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method. + """ + + format: str | AnnotationFormat + do_convert_annotations: bool + + # inspired by https://github.com/facebookresearch/yolos/blob/master/datasets/coco.py#L33 def convert_coco_poly_to_mask(segmentations, height: int, width: int) -> np.ndarray: """ diff --git a/utils/modular_model_converter.py b/utils/modular_model_converter.py index d5dc7dfe23b6..f1e887dedf44 100644 --- a/utils/modular_model_converter.py +++ b/utils/modular_model_converter.py @@ -1303,6 +1303,35 @@ def _code(node: cst.CSTNode) -> str: return other_imports + result +def replace_unprotected_image_processing_imports(files: dict, all_imports: list) -> dict: + """ + Because `image_processing` file uses non-protected torchvision and torch imports, we need to duplicate the nodes + inside `image_processing_pil` instead of importing them directly from `.image_processing_xxx`, which would crash if + torchvision is not installed. 
+ """ + if not ("image_processing" in files and "image_processing_pil" in files): + return files + + body = files["image_processing_pil"] + needed_imports = get_needed_imports(body, all_imports) + import_from_image_processing = None + for import_node in needed_imports: + if isinstance(import_node, cst.SimpleStatementLine) and isinstance(import_node.body[0], cst.ImportFrom): + import_node = import_node.body[0] + full_name = get_full_attribute_name(import_node.module) + if re.search(r"^image_processing_(?!(?:backends)|(?:utils))", full_name): + import_from_image_processing = import_node + break + + if import_from_image_processing is None: + return files + + imported_objects = [x.name.value for x in import_from_image_processing.names] + # Add the nodes inside the body of `image_processing_pil` + body.update({name: files["image_processing"][name] for name in imported_objects}) + return files + + def split_all_assignment(node: cst.CSTNode, model_name: str) -> dict[str, cst.CSTNode]: """Split the `__all__` assignment found in the modular between each corresponding files.""" all_all_per_file = {} @@ -1845,6 +1874,11 @@ def create_modules( all_imports.extend(new_imports) all_imports_code.update(new_imports_code) + # Because `image_processing` file uses non-protected torchvision and torch imports, we need to duplicate the nodes + # here instead of importing from `.image_processing_model`, which would crash if torchvision is not installed + if "image_processing" in files and "image_processing_pil" in files: + files = replace_unprotected_image_processing_imports(files, all_imports) + # Find the correct imports, and write the new modules for file, body in files.items(): new_body = [k[1]["node"] for k in sorted(body.items(), key=lambda x: x[1]["insert_idx"])] From bac493376b9edfa3fe93b2c4aea5c291625a0a5c Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Tue, 21 Apr 2026 18:27:55 +0900 Subject: [PATCH 7/9] fix order --- .../image_processing_pil_deepseek_vl.py | 20 +++---- ...image_processing_pil_deepseek_vl_hybrid.py | 52 +++++++++---------- .../image_processing_pil_llava_onevision.py | 22 ++++---- .../image_processing_pil_segformer.py | 22 ++++---- utils/modular_model_converter.py | 14 ++++- 5 files changed, 71 insertions(+), 59 deletions(-) diff --git a/src/transformers/models/deepseek_vl/image_processing_pil_deepseek_vl.py b/src/transformers/models/deepseek_vl/image_processing_pil_deepseek_vl.py index 6e2a220e3fd2..e868830b0220 100644 --- a/src/transformers/models/deepseek_vl/image_processing_pil_deepseek_vl.py +++ b/src/transformers/models/deepseek_vl/image_processing_pil_deepseek_vl.py @@ -36,6 +36,16 @@ from ...utils import TensorType, auto_docstring +class DeepseekVLImageProcessorKwargs(ImagesKwargs, total=False): + r""" + min_size (`int`, *optional*, defaults to 14): + The minimum allowed size for the resized image. Ensures that neither the height nor width + falls below this value after resizing. + """ + + min_size: int + + @auto_docstring class DeepseekVLImageProcessorPil(PilBackend): resample = PILImageResampling.BICUBIC @@ -161,14 +171,4 @@ def postprocess(self): raise AttributeError("Not needed for DeepseekVL") -class DeepseekVLImageProcessorKwargs(ImagesKwargs, total=False): - r""" - min_size (`int`, *optional*, defaults to 14): - The minimum allowed size for the resized image. Ensures that neither the height nor width - falls below this value after resizing. 
- """ - - min_size: int - - __all__ = ["DeepseekVLImageProcessorPil"] diff --git a/src/transformers/models/deepseek_vl_hybrid/image_processing_pil_deepseek_vl_hybrid.py b/src/transformers/models/deepseek_vl_hybrid/image_processing_pil_deepseek_vl_hybrid.py index b1ea56d48a46..55573c35c423 100644 --- a/src/transformers/models/deepseek_vl_hybrid/image_processing_pil_deepseek_vl_hybrid.py +++ b/src/transformers/models/deepseek_vl_hybrid/image_processing_pil_deepseek_vl_hybrid.py @@ -38,6 +38,32 @@ from ...utils import TensorType, auto_docstring +class DeepseekVLHybridImageProcessorKwargs(ImagesKwargs, total=False): + r""" + min_size (`int`, *optional*, defaults to 14): + The minimum allowed size for the resized image. Ensures that neither the height nor width + falls below this value after resizing. + high_res_size (`dict`, *optional*, defaults to `{"height": 1024, "width": 1024}`): + Size of the high resolution output image after resizing. Can be overridden by the `high_res_size` parameter in the `preprocess` + method. + high_res_resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`): + Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be + overridden by the `high_res_resample` parameter in the `preprocess` method. + high_res_image_mean (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_MEAN`): + Mean to use if normalizing the high resolution image. This is a float or list of floats the length of the number of + channels in the image. Can be overridden by the `high_res_image_mean` parameter in the `preprocess` method. + high_res_image_std (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_STD`): + Standard deviation to use if normalizing the high resolution image. This is a float or list of floats the length of the + number of channels in the image. Can be overridden by the `high_res_image_std` parameter in the `preprocess` method. + """ + + min_size: int + high_res_size: dict + high_res_resample: Union["PILImageResampling", int] + high_res_image_mean: float | list[float] | tuple[float, ...] + high_res_image_std: float | list[float] | tuple[float, ...] + + @auto_docstring class DeepseekVLHybridImageProcessorPil(PilBackend): resample = PILImageResampling.BICUBIC @@ -232,30 +258,4 @@ def _standardize_kwargs( return kwargs -class DeepseekVLHybridImageProcessorKwargs(ImagesKwargs, total=False): - r""" - min_size (`int`, *optional*, defaults to 14): - The minimum allowed size for the resized image. Ensures that neither the height nor width - falls below this value after resizing. - high_res_size (`dict`, *optional*, defaults to `{"height": 1024, "width": 1024}`): - Size of the high resolution output image after resizing. Can be overridden by the `high_res_size` parameter in the `preprocess` - method. - high_res_resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`): - Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be - overridden by the `high_res_resample` parameter in the `preprocess` method. - high_res_image_mean (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_MEAN`): - Mean to use if normalizing the high resolution image. This is a float or list of floats the length of the number of - channels in the image. Can be overridden by the `high_res_image_mean` parameter in the `preprocess` method. 
- high_res_image_std (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_STD`): - Standard deviation to use if normalizing the high resolution image. This is a float or list of floats the length of the - number of channels in the image. Can be overridden by the `high_res_image_std` parameter in the `preprocess` method. - """ - - min_size: int - high_res_size: dict - high_res_resample: Union["PILImageResampling", int] - high_res_image_mean: float | list[float] | tuple[float, ...] - high_res_image_std: float | list[float] | tuple[float, ...] - - __all__ = ["DeepseekVLHybridImageProcessorPil"] diff --git a/src/transformers/models/llava_onevision/image_processing_pil_llava_onevision.py b/src/transformers/models/llava_onevision/image_processing_pil_llava_onevision.py index 96a973ead67d..23534a65d70f 100644 --- a/src/transformers/models/llava_onevision/image_processing_pil_llava_onevision.py +++ b/src/transformers/models/llava_onevision/image_processing_pil_llava_onevision.py @@ -36,6 +36,17 @@ from ...utils import TensorType, auto_docstring +class LlavaOnevisionImageProcessorKwargs(ImagesKwargs, total=False): + r""" + image_grid_pinpoints (`list[list[int]]`, *optional*): + A list of possible resolutions to use for processing high resolution images. The best resolution is selected + based on the original size of the image. Can be overridden by `image_grid_pinpoints` in the `preprocess` + method. + """ + + image_grid_pinpoints: list[list[int]] + + @auto_docstring class LlavaOnevisionImageProcessorPil(PilBackend): model_input_names = ["pixel_values", "image_sizes", "batch_num_images"] @@ -293,15 +304,4 @@ def pad_to_square( return result -class LlavaOnevisionImageProcessorKwargs(ImagesKwargs, total=False): - r""" - image_grid_pinpoints (`list[list[int]]`, *optional*): - A list of possible resolutions to use for processing high resolution images. The best resolution is selected - based on the original size of the image. Can be overridden by `image_grid_pinpoints` in the `preprocess` - method. - """ - - image_grid_pinpoints: list[list[int]] - - __all__ = ["LlavaOnevisionImageProcessorPil"] diff --git a/src/transformers/models/segformer/image_processing_pil_segformer.py b/src/transformers/models/segformer/image_processing_pil_segformer.py index 7bffa8ab490f..f1d0bb0f627b 100644 --- a/src/transformers/models/segformer/image_processing_pil_segformer.py +++ b/src/transformers/models/segformer/image_processing_pil_segformer.py @@ -42,6 +42,17 @@ import torchvision.transforms.v2.functional as tvF +class SegformerImageProcessorKwargs(ImagesKwargs, total=False): + r""" + do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`): + Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0 + is used for background, and background itself is not included in all classes of a dataset (e.g. + ADE20k). The background label will be replaced by 255. + """ + + do_reduce_labels: bool + + @requires(backends=("torch", "torchvision")) class SegformerImageProcessorPil(PilBackend): """PIL backend for Segformer with reduce_label support.""" @@ -209,15 +220,4 @@ def post_process_semantic_segmentation(self, outputs, target_sizes: list[tuple] return semantic_segmentation -class SegformerImageProcessorKwargs(ImagesKwargs, total=False): - r""" - do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`): - Whether or not to reduce all label values of segmentation maps by 1. 
Usually used for datasets where 0 - is used for background, and background itself is not included in all classes of a dataset (e.g. - ADE20k). The background label will be replaced by 255. - """ - - do_reduce_labels: bool - - __all__ = ["SegformerImageProcessorPil"] diff --git a/utils/modular_model_converter.py b/utils/modular_model_converter.py index f1e887dedf44..4ef6fe374e7a 100644 --- a/utils/modular_model_converter.py +++ b/utils/modular_model_converter.py @@ -1327,8 +1327,20 @@ def replace_unprotected_image_processing_imports(files: dict, all_imports: list) return files imported_objects = [x.name.value for x in import_from_image_processing.names] + nodes_to_add = {name: files["image_processing"][name] for name in imported_objects} + # Update the position inside the final file + for name, node_structure in nodes_to_add.items(): + node_with_same_index = next( + v["node"] for v in body.values() if v["insert_idx"] == node_structure["insert_idx"] + ) + # Insert the new node before the corresponding node if the corresponding node is a class + if isinstance(node_with_same_index, cst.ClassDef): + nodes_to_add[name]["insert_idx"] -= 0.5 + # Otherwise, after it + else: + nodes_to_add[name]["insert_idx"] += 0.5 # Add the nodes inside the body of `image_processing_pil` - body.update({name: files["image_processing"][name] for name in imported_objects}) + body.update(nodes_to_add) return files From 58ed72b7da9e2a06385c4480f9457a64f45e2440 Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Tue, 21 Apr 2026 18:34:38 +0900 Subject: [PATCH 8/9] fix order --- .../image_processing_pil_efficientloftr.py | 12 ++--- .../image_processing_pil_ernie4_5_vl_moe.py | 30 +++++------ .../glm46v/image_processing_pil_glm46v.py | 30 +++++------ .../image_processing_pil_glm_image.py | 42 +++++++-------- .../image_processing_pil_lightglue.py | 12 ++--- .../image_processing_pil_mask2former.py | 52 +++++++++---------- .../image_processing_pil_paddleocr_vl.py | 34 ++++++------ .../smolvlm/image_processing_pil_smolvlm.py | 16 +++--- .../image_processing_pil_video_llama_3.py | 42 +++++++-------- utils/modular_model_converter.py | 4 +- 10 files changed, 137 insertions(+), 137 deletions(-) diff --git a/src/transformers/models/efficientloftr/image_processing_pil_efficientloftr.py b/src/transformers/models/efficientloftr/image_processing_pil_efficientloftr.py index 7c42d75f2baa..5f467c56dd4f 100644 --- a/src/transformers/models/efficientloftr/image_processing_pil_efficientloftr.py +++ b/src/transformers/models/efficientloftr/image_processing_pil_efficientloftr.py @@ -32,12 +32,6 @@ import torch -def is_grayscale(image: np.ndarray): - if image.shape[0] == 1: - return True - return np.all(image[0, ...] == image[1, ...]) and np.all(image[1, ...] == image[2, ...]) - - class EfficientLoFTRImageProcessorKwargs(ImagesKwargs, total=False): r""" do_grayscale (`bool`, *optional*, defaults to `self.do_grayscale`): @@ -47,6 +41,12 @@ class EfficientLoFTRImageProcessorKwargs(ImagesKwargs, total=False): do_grayscale: bool +def is_grayscale(image: np.ndarray): + if image.shape[0] == 1: + return True + return np.all(image[0, ...] == image[1, ...]) and np.all(image[1, ...] == image[2, ...]) + + def convert_to_grayscale(image: ImageInput) -> ImageInput: """ Converts an image to grayscale format using the NTSC formula. Only support numpy and PIL Image. 
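The reordering applied to this file (and the ones below) falls out of the fractional `insert_idx` scheme introduced in PATCH 7 above and extended to functions by the converter tweak at the end of this patch: when a duplicated node collides with an existing `insert_idx`, it is nudged to `idx - 0.5` if the colliding node is a class or function, or to `idx + 0.5` otherwise, and the file body is then written out sorted by `insert_idx`. The toy sketch below only illustrates that ordering step, with stand-in strings in place of the converter's real CST nodes.

# toy model of the ordering step: a file body maps names to {"insert_idx": number, "node": source},
# and the final file is written out sorted by insert_idx
body = {
    "is_grayscale": {"insert_idx": 1, "node": "def is_grayscale(image): ..."},
    "convert_to_grayscale": {"insert_idx": 2, "node": "def convert_to_grayscale(image): ..."},
}

# a duplicated kwargs class landing on the same index as a function (or class) is pushed just
# before it by subtracting 0.5, so sorting keeps it ahead of its first user
body["EfficientLoFTRImageProcessorKwargs"] = {
    "insert_idx": 1 - 0.5,
    "node": "class EfficientLoFTRImageProcessorKwargs(ImagesKwargs, total=False): ...",
}

new_body = [entry["node"] for _, entry in sorted(body.items(), key=lambda item: item[1]["insert_idx"])]
print(new_body)  # kwargs class first, then is_grayscale, then convert_to_grayscale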
diff --git a/src/transformers/models/ernie4_5_vl_moe/image_processing_pil_ernie4_5_vl_moe.py b/src/transformers/models/ernie4_5_vl_moe/image_processing_pil_ernie4_5_vl_moe.py index 4b6db850f8da..7f372c3af02d 100644 --- a/src/transformers/models/ernie4_5_vl_moe/image_processing_pil_ernie4_5_vl_moe.py +++ b/src/transformers/models/ernie4_5_vl_moe/image_processing_pil_ernie4_5_vl_moe.py @@ -32,6 +32,21 @@ logger = logging.get_logger(__name__) +class Ernie4_5_VLMoeImageProcessorKwargs(ImagesKwargs, total=False): + r""" + patch_size (`int`, *optional*, defaults to 14): + The spatial patch size of the vision encoder. + temporal_patch_size (`int`, *optional*): + The temporal patch size of the vision encoder. Unused in the image processor, only used for videos. + merge_size (`int`, *optional*, defaults to 2): + The merge size of the vision encoder to llm encoder. + """ + + patch_size: int + temporal_patch_size: int + merge_size: int + + def smart_resize( height: int, width: int, factor: int = 28, min_pixels: int = 56 * 56, max_pixels: int = 14 * 14 * 4 * 1280 ): @@ -61,21 +76,6 @@ def smart_resize( return h_bar, w_bar -class Ernie4_5_VLMoeImageProcessorKwargs(ImagesKwargs, total=False): - r""" - patch_size (`int`, *optional*, defaults to 14): - The spatial patch size of the vision encoder. - temporal_patch_size (`int`, *optional*): - The temporal patch size of the vision encoder. Unused in the image processor, only used for videos. - merge_size (`int`, *optional*, defaults to 2): - The merge size of the vision encoder to llm encoder. - """ - - patch_size: int - temporal_patch_size: int - merge_size: int - - @auto_docstring class Ernie4_5_VLMoeImageProcessorPil(PilBackend): do_resize = True diff --git a/src/transformers/models/glm46v/image_processing_pil_glm46v.py b/src/transformers/models/glm46v/image_processing_pil_glm46v.py index 5070535f6ecf..5601e732c2b3 100644 --- a/src/transformers/models/glm46v/image_processing_pil_glm46v.py +++ b/src/transformers/models/glm46v/image_processing_pil_glm46v.py @@ -30,6 +30,21 @@ from ...utils import TensorType, auto_docstring +class Glm46VImageProcessorKwargs(ImagesKwargs, total=False): + """ + patch_size (`int`, *optional*, defaults to 14): + The spatial patch size of the vision encoder. + temporal_patch_size (`int`, *optional*, defaults to 2): + The temporal patch size of the vision encoder. + merge_size (`int`, *optional*, defaults to 2): + The merge size of the vision encoder to llm encoder. + """ + + patch_size: int + temporal_patch_size: int + merge_size: int + + # Adapted from transformers.models.glm46v.image_processing_glm46v.smart_resize def smart_resize( num_frames: int, @@ -67,21 +82,6 @@ def smart_resize( return h_bar, w_bar -class Glm46VImageProcessorKwargs(ImagesKwargs, total=False): - """ - patch_size (`int`, *optional*, defaults to 14): - The spatial patch size of the vision encoder. - temporal_patch_size (`int`, *optional*, defaults to 2): - The temporal patch size of the vision encoder. - merge_size (`int`, *optional*, defaults to 2): - The merge size of the vision encoder to llm encoder. 
- """ - - patch_size: int - temporal_patch_size: int - merge_size: int - - @auto_docstring class Glm46VImageProcessorPil(PilBackend): do_resize = True diff --git a/src/transformers/models/glm_image/image_processing_pil_glm_image.py b/src/transformers/models/glm_image/image_processing_pil_glm_image.py index 7861cc32a1ae..355bb04adb67 100644 --- a/src/transformers/models/glm_image/image_processing_pil_glm_image.py +++ b/src/transformers/models/glm_image/image_processing_pil_glm_image.py @@ -30,6 +30,27 @@ from ...utils import TensorType, auto_docstring +class GlmImageImageProcessorKwargs(ImagesKwargs, total=False): + r""" + min_pixels (`int`, *optional*, defaults to `56 * 56`): + The min pixels of the image to resize the image. + max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`): + The max pixels of the image to resize the image. + patch_size (`int`, *optional*, defaults to 14): + The spatial patch size of the vision encoder. + temporal_patch_size (`int`, *optional*, defaults to 2): + The temporal patch size of the vision encoder. + merge_size (`int`, *optional*, defaults to 2): + The merge size of the vision encoder to llm encoder. + """ + + min_pixels: int + max_pixels: int + patch_size: int + temporal_patch_size: int + merge_size: int + + def smart_resize( height: int, width: int, @@ -71,27 +92,6 @@ def smart_resize( return h_bar, w_bar -class GlmImageImageProcessorKwargs(ImagesKwargs, total=False): - r""" - min_pixels (`int`, *optional*, defaults to `56 * 56`): - The min pixels of the image to resize the image. - max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`): - The max pixels of the image to resize the image. - patch_size (`int`, *optional*, defaults to 14): - The spatial patch size of the vision encoder. - temporal_patch_size (`int`, *optional*, defaults to 2): - The temporal patch size of the vision encoder. - merge_size (`int`, *optional*, defaults to 2): - The merge size of the vision encoder to llm encoder. - """ - - min_pixels: int - max_pixels: int - patch_size: int - temporal_patch_size: int - merge_size: int - - @auto_docstring class GlmImageImageProcessorPil(PilBackend): do_resize = True diff --git a/src/transformers/models/lightglue/image_processing_pil_lightglue.py b/src/transformers/models/lightglue/image_processing_pil_lightglue.py index 77389f8e8da3..9f43fe1bbc7a 100644 --- a/src/transformers/models/lightglue/image_processing_pil_lightglue.py +++ b/src/transformers/models/lightglue/image_processing_pil_lightglue.py @@ -46,12 +46,6 @@ import torch -def is_grayscale(image: np.ndarray): - if image.shape[0] == 1: - return True - return np.all(image[0, ...] == image[1, ...]) and np.all(image[1, ...] == image[2, ...]) - - class LightGlueImageProcessorKwargs(ImagesKwargs, total=False): r""" do_grayscale (`bool`, *optional*, defaults to `self.do_grayscale`): @@ -61,6 +55,12 @@ class LightGlueImageProcessorKwargs(ImagesKwargs, total=False): do_grayscale: bool +def is_grayscale(image: np.ndarray): + if image.shape[0] == 1: + return True + return np.all(image[0, ...] == image[1, ...]) and np.all(image[1, ...] == image[2, ...]) + + def convert_to_grayscale(image: ImageInput) -> ImageInput: """ Converts an image to grayscale format using the NTSC formula. Only support numpy and PIL Image. 
diff --git a/src/transformers/models/mask2former/image_processing_pil_mask2former.py b/src/transformers/models/mask2former/image_processing_pil_mask2former.py index 6b27657b3677..8358a3601bed 100644 --- a/src/transformers/models/mask2former/image_processing_pil_mask2former.py +++ b/src/transformers/models/mask2former/image_processing_pil_mask2former.py @@ -51,6 +51,32 @@ logger = logging.get_logger(__name__) +class Mask2FormerImageProcessorKwargs(ImagesKwargs, total=False): + r""" + ignore_index (`int`, *optional*): + Label to be assigned to background pixels in segmentation maps. If provided, segmentation map pixels + denoted with 0 (background) will be replaced with `ignore_index`. + do_reduce_labels (`bool`, *optional*, defaults to `False`): + Whether or not to decrement all label values of segmentation maps by 1. Usually used for datasets where 0 + is used for background, and background itself is not included in all classes of a dataset (e.g. ADE20k). + The background label will be replaced by `ignore_index`. + num_labels (`int`, *optional*): + The number of labels in the segmentation map. + size_divisor (`int`, *optional*, defaults to `32`): + Some backbones need images divisible by a certain number. If not passed, it defaults to the value used in + Swin Transformer. + pad_size (`SizeDict`, *optional*): + The size to pad the images to. Must be larger than any image size provided for preprocessing. If `pad_size` + is not provided, images will be padded to the largest height and width in the batch. + """ + + ignore_index: int | None + do_reduce_labels: bool + num_labels: int | None + size_divisor: int + pad_size: SizeDict | None + + def convert_segmentation_map_to_binary_masks( segmentation_map: np.ndarray, instance_id_to_semantic_id: dict[int, int] | None = None, @@ -87,32 +113,6 @@ def convert_segmentation_map_to_binary_masks( return binary_masks.astype(np.float32), labels -class Mask2FormerImageProcessorKwargs(ImagesKwargs, total=False): - r""" - ignore_index (`int`, *optional*): - Label to be assigned to background pixels in segmentation maps. If provided, segmentation map pixels - denoted with 0 (background) will be replaced with `ignore_index`. - do_reduce_labels (`bool`, *optional*, defaults to `False`): - Whether or not to decrement all label values of segmentation maps by 1. Usually used for datasets where 0 - is used for background, and background itself is not included in all classes of a dataset (e.g. ADE20k). - The background label will be replaced by `ignore_index`. - num_labels (`int`, *optional*): - The number of labels in the segmentation map. - size_divisor (`int`, *optional*, defaults to `32`): - Some backbones need images divisible by a certain number. If not passed, it defaults to the value used in - Swin Transformer. - pad_size (`SizeDict`, *optional*): - The size to pad the images to. Must be larger than any image size provided for preprocessing. If `pad_size` - is not provided, images will be padded to the largest height and width in the batch. 
- """ - - ignore_index: int | None - do_reduce_labels: bool - num_labels: int | None - size_divisor: int - pad_size: SizeDict | None - - # Adapted from transformers.models.mask2former.image_processing_mask2former.binary_mask_to_rle def binary_mask_to_rle(mask): """ diff --git a/src/transformers/models/paddleocr_vl/image_processing_pil_paddleocr_vl.py b/src/transformers/models/paddleocr_vl/image_processing_pil_paddleocr_vl.py index c524acc0debc..ac639892640f 100644 --- a/src/transformers/models/paddleocr_vl/image_processing_pil_paddleocr_vl.py +++ b/src/transformers/models/paddleocr_vl/image_processing_pil_paddleocr_vl.py @@ -35,6 +35,23 @@ from ...utils import TensorType, auto_docstring +class PaddleOCRVLImageProcessorKwargs(ImagesKwargs, total=False): + r""" + patch_size (`int`, *optional*, defaults to 14): + The spatial patch size of the vision encoder. + temporal_patch_size (`int`, *optional*, defaults to 1): + The temporal patch size of the vision encoder. + merge_size (`int`, *optional*, defaults to 2): + The merge size of the vision encoder to llm encoder. + """ + + min_pixels: int + max_pixels: int + patch_size: int + temporal_patch_size: int + merge_size: int + + def smart_resize( height: int, width: int, @@ -67,23 +84,6 @@ def smart_resize( return h_bar, w_bar -class PaddleOCRVLImageProcessorKwargs(ImagesKwargs, total=False): - r""" - patch_size (`int`, *optional*, defaults to 14): - The spatial patch size of the vision encoder. - temporal_patch_size (`int`, *optional*, defaults to 1): - The temporal patch size of the vision encoder. - merge_size (`int`, *optional*, defaults to 2): - The merge size of the vision encoder to llm encoder. - """ - - min_pixels: int - max_pixels: int - patch_size: int - temporal_patch_size: int - merge_size: int - - @auto_docstring class PaddleOCRVLImageProcessorPil(PilBackend): do_resize = True diff --git a/src/transformers/models/smolvlm/image_processing_pil_smolvlm.py b/src/transformers/models/smolvlm/image_processing_pil_smolvlm.py index dea8fad98b32..3d53ed09c11f 100644 --- a/src/transformers/models/smolvlm/image_processing_pil_smolvlm.py +++ b/src/transformers/models/smolvlm/image_processing_pil_smolvlm.py @@ -39,14 +39,6 @@ from ...utils import TensorType, auto_docstring -def _make_pixel_mask(image: np.ndarray, output_size: tuple[int, int]) -> np.ndarray: - """Make pixel mask: 1=valid, 0=padding. Images are CHW.""" - h, w = image.shape[-2:] - mask = np.zeros(output_size, dtype=np.int64) - mask[:h, :w] = 1 - return mask - - class SmolVLMImageProcessorKwargs(ImagesKwargs, total=False): """ do_image_splitting (`bool`, *optional*, defaults to `True`): @@ -63,6 +55,14 @@ class SmolVLMImageProcessorKwargs(ImagesKwargs, total=False): return_row_col_info: bool +def _make_pixel_mask(image: np.ndarray, output_size: tuple[int, int]) -> np.ndarray: + """Make pixel mask: 1=valid, 0=padding. 
Images are CHW.""" + h, w = image.shape[-2:] + mask = np.zeros(output_size, dtype=np.int64) + mask[:h, :w] = 1 + return mask + + # Adapted from transformers.models.smolvlm.image_processing_smolvlm.MAX_IMAGE_SIZE MAX_IMAGE_SIZE = 4096 # 4k resolution as absolute maximum diff --git a/src/transformers/models/video_llama_3/image_processing_pil_video_llama_3.py b/src/transformers/models/video_llama_3/image_processing_pil_video_llama_3.py index a48e79e09936..5272c7465b2b 100644 --- a/src/transformers/models/video_llama_3/image_processing_pil_video_llama_3.py +++ b/src/transformers/models/video_llama_3/image_processing_pil_video_llama_3.py @@ -30,6 +30,27 @@ from ...utils import TensorType, auto_docstring +class VideoLlama3ImageProcessorKwargs(ImagesKwargs, total=False): + r""" + min_pixels (`int`, *optional*, defaults to `56 * 56`): + The min pixels of the image to resize the image. + max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`): + The max pixels of the image to resize the image. + patch_size (`int`, *optional*, defaults to 14): + The spatial patch size of the vision encoder. + temporal_patch_size (`int`, *optional*, defaults to 2): + The temporal patch size of the vision encoder. + merge_size (`int`, *optional*, defaults to 2): + The merge size of the vision encoder to llm encoder. + """ + + min_pixels: int + max_pixels: int + patch_size: int + temporal_patch_size: int + merge_size: int + + def smart_resize( height: int, width: int, factor: int = 28, min_pixels: int = 56 * 56, max_pixels: int = 14 * 14 * 4 * 1280 ): @@ -59,27 +80,6 @@ def smart_resize( return h_bar, w_bar -class VideoLlama3ImageProcessorKwargs(ImagesKwargs, total=False): - r""" - min_pixels (`int`, *optional*, defaults to `56 * 56`): - The min pixels of the image to resize the image. - max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`): - The max pixels of the image to resize the image. - patch_size (`int`, *optional*, defaults to 14): - The spatial patch size of the vision encoder. - temporal_patch_size (`int`, *optional*, defaults to 2): - The temporal patch size of the vision encoder. - merge_size (`int`, *optional*, defaults to 2): - The merge size of the vision encoder to llm encoder. 
- """ - - min_pixels: int - max_pixels: int - patch_size: int - temporal_patch_size: int - merge_size: int - - @auto_docstring class VideoLlama3ImageProcessorPil(PilBackend): do_resize = True diff --git a/utils/modular_model_converter.py b/utils/modular_model_converter.py index 4ef6fe374e7a..018316680ece 100644 --- a/utils/modular_model_converter.py +++ b/utils/modular_model_converter.py @@ -1333,8 +1333,8 @@ def replace_unprotected_image_processing_imports(files: dict, all_imports: list) node_with_same_index = next( v["node"] for v in body.values() if v["insert_idx"] == node_structure["insert_idx"] ) - # Insert the new node before the corresponding node if the corresponding node is a class - if isinstance(node_with_same_index, cst.ClassDef): + # Insert the new node before the corresponding node if the corresponding node is a class or function + if isinstance(node_with_same_index, (cst.ClassDef, cst.FunctionDef)): nodes_to_add[name]["insert_idx"] -= 0.5 # Otherwise, after it else: From 05e1b5b243cd40348661bfcc3d47e02160e9eca5 Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Tue, 21 Apr 2026 18:36:55 +0900 Subject: [PATCH 9/9] revert --- src/transformers/dependency_versions_table.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index 0456904dd3d5..b08aa558d795 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -56,7 +56,6 @@ "rjieba": "rjieba", "rouge-score": "rouge-score!=0.0.7,!=0.0.8,!=0.1,!=0.1.1", "ruff": "ruff==0.14.10", - "transformers-mlinter": "transformers-mlinter @ git+https://github.com/huggingface/transformers-mlinter@b9d319ce264c106f97a959d926ef42bc3c0ea4d1", "ty": "ty==0.0.20", "sacrebleu": "sacrebleu>=1.4.12,<2.0.0", "sacremoses": "sacremoses",