@@ -61,6 +61,8 @@

logger = logging.get_logger(__name__)

SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)


class ConditionalDetrImageProcessorKwargs(ImagesKwargs, total=False):
r"""
@@ -76,9 +78,6 @@ class ConditionalDetrImageProcessorKwargs(ImagesKwargs, total=False):
do_convert_annotations: bool


SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)


# inspired by https://github.com/facebookresearch/conditional_detr/blob/master/datasets/coco.py#L33
def convert_coco_poly_to_mask(segmentations, height: int, width: int) -> np.ndarray:
"""
@@ -20,13 +20,12 @@
from ...image_transforms import (
center_to_corners_format,
)
from ...image_utils import AnnotationFormat
from ...masking_utils import create_bidirectional_mask
from ...modeling_outputs import (
BaseModelOutput,
)
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
from ...processing_utils import ImagesKwargs, Unpack
from ...processing_utils import Unpack
from ...utils import (
TensorType,
TransformersKwargs,
@@ -66,20 +65,6 @@
logger = logging.get_logger(__name__)


class ConditionalDetrImageProcessorKwargs(ImagesKwargs, total=False):
r"""
format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`):
Data format of the annotations. One of "coco_detection" or "coco_panoptic".
do_convert_annotations (`bool`, *optional*, defaults to `True`):
Controls whether to convert the annotations to the format expected by the CONDITIONAL_DETR model. Converts the
bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
"""

format: str | AnnotationFormat
do_convert_annotations: bool


class ConditionalDetrImageProcessor(DetrImageProcessor):
def post_process_object_detection(
self, outputs, threshold: float = 0.5, target_sizes: TensorType | list[tuple] = None, top_k: int = 100
12 changes: 1 addition & 11 deletions src/transformers/models/deepseek_vl/modular_deepseek_vl.py
@@ -20,7 +20,7 @@
from ...configuration_utils import PreTrainedConfig
from ...image_processing_utils import BatchFeature
from ...image_utils import ImageInput
from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack
from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
from ...tokenization_utils_base import (
PreTokenizedInput,
TextInput,
@@ -152,16 +152,6 @@ def generate(self):
raise AttributeError("Not needed for DeepseekVL")


class DeepseekVLImageProcessorKwargs(ImagesKwargs, total=False):
r"""
min_size (`int`, *optional*, defaults to 14):
The minimum allowed size for the resized image. Ensures that neither the height nor width
falls below this value after resizing.
"""

min_size: int


class DeepseekVLImageProcessorPil(JanusImageProcessorPil):
def postprocess(self):
raise AttributeError("Not needed for DeepseekVL")
@@ -57,6 +57,8 @@
if is_torch_available():
import torch

SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)


class DeformableDetrImageProcessorKwargs(ImagesKwargs, total=False):
r"""
@@ -72,9 +74,6 @@ class DeformableDetrImageProcessorKwargs(ImagesKwargs, total=False):
do_convert_annotations: bool


SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)


# inspired by https://github.com/facebookresearch/deformable_detr/blob/master/datasets/coco.py#L33
def convert_coco_poly_to_mask(segmentations, height: int, width: int) -> np.ndarray:
"""
@@ -23,11 +23,10 @@
from ... import initialization as init
from ...backbone_utils import load_backbone
from ...image_transforms import center_to_corners_format
from ...image_utils import AnnotationFormat
from ...integrations import use_kernel_forward_from_hub
from ...modeling_outputs import BaseModelOutput
from ...modeling_utils import PreTrainedModel
from ...processing_utils import ImagesKwargs, Unpack
from ...processing_utils import Unpack
from ...utils import (
ModelOutput,
TensorType,
@@ -61,20 +60,6 @@
logger = logging.get_logger(__name__)


class DeformableDetrImageProcessorKwargs(ImagesKwargs, total=False):
r"""
format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`):
Data format of the annotations. One of "coco_detection" or "coco_panoptic".
do_convert_annotations (`bool`, *optional*, defaults to `True`):
Controls whether to convert the annotations to the format expected by the DEFORMABLE_DETR model. Converts the
bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
"""

format: str | AnnotationFormat
do_convert_annotations: bool


class DeformableDetrImageProcessor(DetrImageProcessor):
def post_process_object_detection(
self, outputs, threshold: float = 0.5, target_sizes: TensorType | list[tuple] = None, top_k: int = 100
10 changes: 0 additions & 10 deletions src/transformers/models/efficientloftr/modular_efficientloftr.py
@@ -1,6 +1,5 @@
from typing import TYPE_CHECKING

from ...processing_utils import ImagesKwargs
from ...utils import TensorType, is_torch_available
from ...utils.import_utils import requires
from ..superglue.image_processing_pil_superglue import SuperGlueImageProcessorPil
@@ -14,15 +13,6 @@
from .modeling_efficientloftr import EfficientLoFTRKeypointMatchingOutput


class EfficientLoFTRImageProcessorKwargs(ImagesKwargs, total=False):
r"""
do_grayscale (`bool`, *optional*, defaults to `self.do_grayscale`):
Whether to convert the image to grayscale. Can be overridden by `do_grayscale` in the `preprocess` method.
"""

do_grayscale: bool


class EfficientLoFTRImageProcessor(SuperGlueImageProcessor):
def post_process_keypoint_matching(
self,
@@ -43,7 +43,7 @@
from ...modeling_outputs import BaseModelOutputWithPooling, MoeCausalLMOutputWithPast, MoeModelOutputWithPast
from ...modeling_rope_utils import dynamic_rope_update
from ...modeling_utils import PreTrainedModel
from ...processing_utils import ImagesKwargs, Unpack
from ...processing_utils import Unpack
from ...utils import (
TensorType,
TransformersKwargs,
@@ -63,7 +63,7 @@
Ernie4_5_MoeStatics,
Ernie4_5_MoeTopKRouter,
)
from ..glm4v.image_processing_glm4v import Glm4vImageProcessor
from ..glm4v.image_processing_glm4v import Glm4vImageProcessor, Glm4vImageProcessorKwargs
from ..glm4v.image_processing_pil_glm4v import Glm4vImageProcessorPil
from ..glm4v.modeling_glm4v import Glm4vForConditionalGeneration
from ..mixtral.modeling_mixtral import load_balancing_loss_func
@@ -1220,7 +1220,7 @@ def forward(
)


class Ernie4_5_VLMoeImageProcessorKwargs(ImagesKwargs, total=False):
class Ernie4_5_VLMoeImageProcessorKwargs(Glm4vImageProcessorKwargs):
r"""
patch_size (`int`, *optional*, defaults to 14):
The spatial patch size of the vision encoder.
@@ -1230,10 +1230,6 @@ class Ernie4_5_VLMoeImageProcessorKwargs(ImagesKwargs, total=False):
The merge size of the vision encoder to llm encoder.
"""

patch_size: int
temporal_patch_size: int
merge_size: int


class Ernie4_5_VLMoeImageProcessorPil(Glm4vImageProcessorPil):
size = {"shortest_edge": 56 * 56, "longest_edge": 28 * 28 * 6177}
@@ -30,7 +30,6 @@
from ...utils import TensorType, auto_docstring


# Adapted from transformers.models.glm_image.image_processing_glm_image.GlmImageImageProcessorKwargs
class GlmImageImageProcessorKwargs(ImagesKwargs, total=False):
r"""
min_pixels (`int`, *optional*, defaults to `56 * 56`):
@@ -67,6 +67,8 @@
if is_torch_available():
import torch

SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)


class GroundingDinoImageProcessorKwargs(ImagesKwargs, total=False):
r"""
@@ -82,9 +84,6 @@ class GroundingDinoImageProcessorKwargs(ImagesKwargs, total=False):
do_convert_annotations: bool


SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)


# inspired by https://github.com/facebookresearch/grounding_dino/blob/master/datasets/coco.py#L33
def convert_coco_poly_to_mask(segmentations, height: int, width: int) -> np.ndarray:
"""
16 changes: 0 additions & 16 deletions src/transformers/models/grounding_dino/modular_grounding_dino.py
@@ -25,8 +25,6 @@
from transformers.models.detr.image_processing_pil_detr import DetrImageProcessorPil

from ...image_transforms import center_to_corners_format
from ...image_utils import AnnotationFormat
from ...processing_utils import ImagesKwargs
from ...utils import (
TensorType,
logging,
@@ -70,20 +68,6 @@ def _scale_boxes(boxes, target_sizes):
return boxes


class GroundingDinoImageProcessorKwargs(ImagesKwargs, total=False):
r"""
format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`):
Data format of the annotations. One of "coco_detection" or "coco_panoptic".
do_convert_annotations (`bool`, *optional*, defaults to `True`):
Controls whether to convert the annotations to the format expected by the GROUNDING_DINO model. Converts the
bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
"""

format: str | AnnotationFormat
do_convert_annotations: bool


class GroundingDinoImageProcessor(DetrImageProcessor):
def post_process_object_detection(
self,
13 changes: 4 additions & 9 deletions src/transformers/models/lightglue/modular_lightglue.py
@@ -23,7 +23,7 @@
from ...configuration_utils import PreTrainedConfig
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import ImagesKwargs, Unpack
from ...processing_utils import Unpack
from ...utils import ModelOutput, TensorType, auto_docstring, can_return_tuple, logging
from ...utils.import_utils import requires
from ..auto import CONFIG_MAPPING, AutoConfig
@@ -32,7 +32,7 @@
from ..cohere.modeling_cohere import apply_rotary_pos_emb
from ..llama.modeling_llama import LlamaAttention, eager_attention_forward
from ..superglue.image_processing_pil_superglue import SuperGlueImageProcessorPil
from ..superglue.image_processing_superglue import SuperGlueImageProcessor
from ..superglue.image_processing_superglue import SuperGlueImageProcessor, SuperGlueImageProcessorKwargs
from ..superpoint import SuperPointConfig


@@ -154,13 +154,8 @@ class LightGlueKeypointMatchingOutput(ModelOutput):
attentions: tuple[torch.FloatTensor] | None = None


class LightGlueImageProcessorKwargs(ImagesKwargs, total=False):
r"""
do_grayscale (`bool`, *optional*, defaults to `self.do_grayscale`):
Whether to convert the image to grayscale. Can be overridden by `do_grayscale` in the `preprocess` method.
"""

do_grayscale: bool
class LightGlueImageProcessorKwargs(SuperGlueImageProcessorKwargs):
pass


class LightGlueImageProcessor(SuperGlueImageProcessor):
@@ -34,7 +34,7 @@
)
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_outputs import BaseModelOutputWithPooling
from ...processing_utils import ImagesKwargs, Unpack
from ...processing_utils import Unpack
from ...utils import TensorType, auto_docstring, logging
from ...utils.generic import can_return_tuple, merge_with_config_defaults
from ..llava_next.image_processing_llava_next import LlavaNextImageProcessor, LlavaNextImageProcessorKwargs
@@ -217,17 +217,6 @@ def _preprocess(
)


class LlavaOnevisionImageProcessorKwargs(ImagesKwargs, total=False):
r"""
image_grid_pinpoints (`list[list[int]]`, *optional*):
A list of possible resolutions to use for processing high resolution images. The best resolution is selected
based on the original size of the image. Can be overridden by `image_grid_pinpoints` in the `preprocess`
method.
"""

image_grid_pinpoints: list[list[int]]


class LlavaOnevisionImageProcessorPil(LlavaNextImageProcessorPil):
resample = PILImageResampling.BICUBIC
image_mean = OPENAI_CLIP_MEAN
28 changes: 0 additions & 28 deletions src/transformers/models/mask2former/modular_mask2former.py
@@ -15,8 +15,6 @@
import torch
from torch import nn

from ...image_utils import SizeDict
from ...processing_utils import ImagesKwargs
from ...utils import (
TensorType,
logging,
@@ -35,32 +33,6 @@
logger = logging.get_logger(__name__)


class Mask2FormerImageProcessorKwargs(ImagesKwargs, total=False):
r"""
ignore_index (`int`, *optional*):
Label to be assigned to background pixels in segmentation maps. If provided, segmentation map pixels
denoted with 0 (background) will be replaced with `ignore_index`.
do_reduce_labels (`bool`, *optional*, defaults to `False`):
Whether or not to decrement all label values of segmentation maps by 1. Usually used for datasets where 0
is used for background, and background itself is not included in all classes of a dataset (e.g. ADE20k).
The background label will be replaced by `ignore_index`.
num_labels (`int`, *optional*):
The number of labels in the segmentation map.
size_divisor (`int`, *optional*, defaults to `32`):
Some backbones need images divisible by a certain number. If not passed, it defaults to the value used in
Swin Transformer.
pad_size (`SizeDict`, *optional*):
The size to pad the images to. Must be larger than any image size provided for preprocessing. If `pad_size`
is not provided, images will be padded to the largest height and width in the batch.
"""

ignore_index: int | None
do_reduce_labels: bool
num_labels: int | None
size_divisor: int
pad_size: SizeDict | None


class Mask2FormerImageProcessor(MaskFormerImageProcessor):
def post_process_semantic_segmentation(
self, outputs, target_sizes: list[tuple[int, int]] | None = None
11 changes: 2 additions & 9 deletions src/transformers/models/paddleocr_vl/modular_paddleocr_vl.py
@@ -38,9 +38,8 @@
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPast, BaseModelOutputWithPooling
from ...modeling_utils import PreTrainedModel
from ...models.qwen2_vl.image_processing_pil_qwen2_vl import Qwen2VLImageProcessorPil
from ...models.qwen2_vl.image_processing_qwen2_vl import Qwen2VLImageProcessor
from ...models.qwen2_vl.image_processing_qwen2_vl import Qwen2VLImageProcessor, Qwen2VLImageProcessorKwargs
from ...processing_utils import (
ImagesKwargs,
ProcessingKwargs,
ProcessorMixin,
Unpack,
@@ -123,7 +122,7 @@ def smart_resize(
return h_bar, w_bar


class PaddleOCRVLImageProcessorKwargs(ImagesKwargs, total=False):
class PaddleOCRVLImageProcessorKwargs(Qwen2VLImageProcessorKwargs):
r"""
patch_size (`int`, *optional*, defaults to 14):
The spatial patch size of the vision encoder.
@@ -133,12 +132,6 @@ class PaddleOCRVLImageProcessorKwargs(ImagesKwargs, total=False):
The merge size of the vision encoder to llm encoder.
"""

min_pixels: int
max_pixels: int
patch_size: int
temporal_patch_size: int
merge_size: int


class PaddleOCRVLImageProcessorPil(Qwen2VLImageProcessorPil):
size = {"shortest_edge": 384 * 384, "longest_edge": 1536 * 1536}