From ce250d57da19e1ac68f8b56eeb8714d295b5484d Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Tue, 21 Apr 2026 09:23:09 +0900 Subject: [PATCH 1/9] revert modular: changes break modular's purpose --- utils/modular_model_converter.py | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/utils/modular_model_converter.py b/utils/modular_model_converter.py index 48dc46b8b593..5fd453816f54 100644 --- a/utils/modular_model_converter.py +++ b/utils/modular_model_converter.py @@ -1692,10 +1692,6 @@ class NewNameModel(LlamaModel): class_file_type = find_file_type(class_name, new_name) # In this case, we need to remove it from the dependencies and create a new import instead if class_file_type != file_type: - # image_processing_pil and image_processing must never depend on each other. - # When a PIL class needs an image_processing class, inline it instead of importing. - if file_type == "image_processing_pil" and class_file_type == "image_processing": - continue corrected_dependencies.remove(class_name) import_statement = f"from .{class_file_type}_{new_name} import {class_name}" new_imports[class_name] = cst.parse_statement(import_statement) @@ -1748,14 +1744,7 @@ class node based on the inherited classes if needed. Also returns any new import # Remove all classes explicitly defined in modular from the dependencies. Otherwise, if a class is referenced # before its new modular definition, it may be wrongly imported from elsewhere as a dependency if it matches # another class from a modeling file after renaming, even though it would be added after anyway (leading to duplicates) - # Exception: for image_processing_pil files, image_processing modular classes must be inlined (not excluded), - # because these two files must never import from each other. - classes_to_exclude = set(modular_mapper.classes.keys()) - if file_type == "image_processing_pil": - classes_to_exclude -= { - k for k in classes_to_exclude if find_file_type(k, model_name) == "image_processing" - } - new_node_dependencies -= classes_to_exclude + new_node_dependencies -= set(modular_mapper.classes.keys()) # The node was modified -> look for all recursive dependencies of the new node all_dependencies_to_add = find_all_dependencies( @@ -1790,9 +1779,7 @@ class node based on the inherited classes if needed. 
Also returns any new import relative_dependency_order = modular_mapper.compute_relative_order(all_dependencies_to_add) nodes_to_add = { - dep: (relative_dependency_order[dep], modular_mapper.global_nodes[dep]) - for dep in all_dependencies_to_add - if dep not in file_to_update + dep: (relative_dependency_order[dep], mapper.global_nodes[dep]) for dep in all_dependencies_to_add } # Add the class node itself to the nodes to add From 25b19dcb681b3b2b53fb051dd3082a4a00f8e1ec Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Tue, 21 Apr 2026 09:40:29 +0900 Subject: [PATCH 2/9] revert the changes to the modulars --- .../modular_conditional_detr.py | 17 +---------- .../models/deepseek_vl/modular_deepseek_vl.py | 12 +------- .../modular_deformable_detr.py | 17 +---------- .../efficientloftr/modular_efficientloftr.py | 10 ------- .../modular_ernie4_5_vl_moe.py | 19 +++---------- .../grounding_dino/modular_grounding_dino.py | 16 ----------- .../models/lightglue/modular_lightglue.py | 13 +++------ .../modular_llava_onevision.py | 13 +-------- .../models/mask2former/modular_mask2former.py | 28 ------------------- .../paddleocr_vl/modular_paddleocr_vl.py | 11 ++------ .../models/rt_detr/modular_rt_detr.py | 14 ---------- .../models/segformer/modular_segformer.py | 12 -------- .../models/smolvlm/modular_smolvlm.py | 18 +----------- .../video_llama_3/modular_video_llama_3.py | 25 +++-------------- 14 files changed, 19 insertions(+), 206 deletions(-) diff --git a/src/transformers/models/conditional_detr/modular_conditional_detr.py b/src/transformers/models/conditional_detr/modular_conditional_detr.py index ffc1e78bee01..2205b85c5547 100644 --- a/src/transformers/models/conditional_detr/modular_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modular_conditional_detr.py @@ -20,13 +20,12 @@ from ...image_transforms import ( center_to_corners_format, ) -from ...image_utils import AnnotationFormat from ...masking_utils import create_bidirectional_mask from ...modeling_outputs import ( BaseModelOutput, ) from ...modeling_utils import ALL_ATTENTION_FUNCTIONS -from ...processing_utils import ImagesKwargs, Unpack +from ...processing_utils import Unpack from ...utils import ( TensorType, TransformersKwargs, @@ -66,20 +65,6 @@ logger = logging.get_logger(__name__) -class ConditionalDetrImageProcessorKwargs(ImagesKwargs, total=False): - r""" - format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): - Data format of the annotations. One of "coco_detection" or "coco_panoptic". - do_convert_annotations (`bool`, *optional*, defaults to `True`): - Controls whether to convert the annotations to the format expected by the CONDITIONAL_DETR model. Converts the - bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`. - Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method. 
- """ - - format: str | AnnotationFormat - do_convert_annotations: bool - - class ConditionalDetrImageProcessor(DetrImageProcessor): def post_process_object_detection( self, outputs, threshold: float = 0.5, target_sizes: TensorType | list[tuple] = None, top_k: int = 100 diff --git a/src/transformers/models/deepseek_vl/modular_deepseek_vl.py b/src/transformers/models/deepseek_vl/modular_deepseek_vl.py index a56da6f3fe0a..be955c6fd41e 100644 --- a/src/transformers/models/deepseek_vl/modular_deepseek_vl.py +++ b/src/transformers/models/deepseek_vl/modular_deepseek_vl.py @@ -20,7 +20,7 @@ from ...configuration_utils import PreTrainedConfig from ...image_processing_utils import BatchFeature from ...image_utils import ImageInput -from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack +from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import ( PreTokenizedInput, TextInput, @@ -152,16 +152,6 @@ def generate(self): raise AttributeError("Not needed for DeepseekVL") -class DeepseekVLImageProcessorKwargs(ImagesKwargs, total=False): - r""" - min_size (`int`, *optional*, defaults to 14): - The minimum allowed size for the resized image. Ensures that neither the height nor width - falls below this value after resizing. - """ - - min_size: int - - class DeepseekVLImageProcessorPil(JanusImageProcessorPil): def postprocess(self): raise AttributeError("Not needed for DeepseekVL") diff --git a/src/transformers/models/deformable_detr/modular_deformable_detr.py b/src/transformers/models/deformable_detr/modular_deformable_detr.py index a2f80e8236ad..a4a5b4acd95a 100644 --- a/src/transformers/models/deformable_detr/modular_deformable_detr.py +++ b/src/transformers/models/deformable_detr/modular_deformable_detr.py @@ -23,11 +23,10 @@ from ... import initialization as init from ...backbone_utils import load_backbone from ...image_transforms import center_to_corners_format -from ...image_utils import AnnotationFormat from ...integrations import use_kernel_forward_from_hub from ...modeling_outputs import BaseModelOutput from ...modeling_utils import PreTrainedModel -from ...processing_utils import ImagesKwargs, Unpack +from ...processing_utils import Unpack from ...utils import ( ModelOutput, TensorType, @@ -61,20 +60,6 @@ logger = logging.get_logger(__name__) -class DeformableDetrImageProcessorKwargs(ImagesKwargs, total=False): - r""" - format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): - Data format of the annotations. One of "coco_detection" or "coco_panoptic". - do_convert_annotations (`bool`, *optional*, defaults to `True`): - Controls whether to convert the annotations to the format expected by the DEFORMABLE_DETR model. Converts the - bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`. - Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method. 
- """ - - format: str | AnnotationFormat - do_convert_annotations: bool - - class DeformableDetrImageProcessor(DetrImageProcessor): def post_process_object_detection( self, outputs, threshold: float = 0.5, target_sizes: TensorType | list[tuple] = None, top_k: int = 100 diff --git a/src/transformers/models/efficientloftr/modular_efficientloftr.py b/src/transformers/models/efficientloftr/modular_efficientloftr.py index 17e3e399a8df..86d8d34eba70 100644 --- a/src/transformers/models/efficientloftr/modular_efficientloftr.py +++ b/src/transformers/models/efficientloftr/modular_efficientloftr.py @@ -1,6 +1,5 @@ from typing import TYPE_CHECKING -from ...processing_utils import ImagesKwargs from ...utils import TensorType, is_torch_available from ...utils.import_utils import requires from ..superglue.image_processing_pil_superglue import SuperGlueImageProcessorPil @@ -14,15 +13,6 @@ from .modeling_efficientloftr import EfficientLoFTRKeypointMatchingOutput -class EfficientLoFTRImageProcessorKwargs(ImagesKwargs, total=False): - r""" - do_grayscale (`bool`, *optional*, defaults to `self.do_grayscale`): - Whether to convert the image to grayscale. Can be overridden by `do_grayscale` in the `preprocess` method. - """ - - do_grayscale: bool - - class EfficientLoFTRImageProcessor(SuperGlueImageProcessor): def post_process_keypoint_matching( self, diff --git a/src/transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py b/src/transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py index 42bbb44b70a5..0e9f27d5c41d 100644 --- a/src/transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +++ b/src/transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py @@ -43,7 +43,7 @@ from ...modeling_outputs import BaseModelOutputWithPooling, MoeCausalLMOutputWithPast, MoeModelOutputWithPast from ...modeling_rope_utils import dynamic_rope_update from ...modeling_utils import PreTrainedModel -from ...processing_utils import ImagesKwargs, Unpack +from ...processing_utils import Unpack from ...utils import ( TensorType, TransformersKwargs, @@ -63,7 +63,7 @@ Ernie4_5_MoeStatics, Ernie4_5_MoeTopKRouter, ) -from ..glm4v.image_processing_glm4v import Glm4vImageProcessor +from ..glm4v.image_processing_glm4v import Glm4vImageProcessor, Glm4vImageProcessorKwargs from ..glm4v.image_processing_pil_glm4v import Glm4vImageProcessorPil from ..glm4v.modeling_glm4v import Glm4vForConditionalGeneration from ..mixtral.modeling_mixtral import load_balancing_loss_func @@ -1220,19 +1220,8 @@ def forward( ) -class Ernie4_5_VLMoeImageProcessorKwargs(ImagesKwargs, total=False): - r""" - patch_size (`int`, *optional*, defaults to 14): - The spatial patch size of the vision encoder. - temporal_patch_size (`int`, *optional*): - The temporal patch size of the vision encoder. Unused in the image processor, only used for videos. - merge_size (`int`, *optional*, defaults to 2): - The merge size of the vision encoder to llm encoder. 
- """ - - patch_size: int - temporal_patch_size: int - merge_size: int +class Ernie4_5_VLMoeImageProcessorKwargs(Glm4vImageProcessorKwargs): + pass class Ernie4_5_VLMoeImageProcessorPil(Glm4vImageProcessorPil): diff --git a/src/transformers/models/grounding_dino/modular_grounding_dino.py b/src/transformers/models/grounding_dino/modular_grounding_dino.py index 483ad262a602..bd35fd512ffe 100644 --- a/src/transformers/models/grounding_dino/modular_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modular_grounding_dino.py @@ -25,8 +25,6 @@ from transformers.models.detr.image_processing_pil_detr import DetrImageProcessorPil from ...image_transforms import center_to_corners_format -from ...image_utils import AnnotationFormat -from ...processing_utils import ImagesKwargs from ...utils import ( TensorType, logging, @@ -70,20 +68,6 @@ def _scale_boxes(boxes, target_sizes): return boxes -class GroundingDinoImageProcessorKwargs(ImagesKwargs, total=False): - r""" - format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): - Data format of the annotations. One of "coco_detection" or "coco_panoptic". - do_convert_annotations (`bool`, *optional*, defaults to `True`): - Controls whether to convert the annotations to the format expected by the GROUNDING_DINO model. Converts the - bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`. - Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method. - """ - - format: str | AnnotationFormat - do_convert_annotations: bool - - class GroundingDinoImageProcessor(DetrImageProcessor): def post_process_object_detection( self, diff --git a/src/transformers/models/lightglue/modular_lightglue.py b/src/transformers/models/lightglue/modular_lightglue.py index 62082b678b00..afc8a3efec25 100644 --- a/src/transformers/models/lightglue/modular_lightglue.py +++ b/src/transformers/models/lightglue/modular_lightglue.py @@ -23,7 +23,7 @@ from ...configuration_utils import PreTrainedConfig from ...modeling_flash_attention_utils import FlashAttentionKwargs from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel -from ...processing_utils import ImagesKwargs, Unpack +from ...processing_utils import Unpack from ...utils import ModelOutput, TensorType, auto_docstring, can_return_tuple, logging from ...utils.import_utils import requires from ..auto import CONFIG_MAPPING, AutoConfig @@ -32,7 +32,7 @@ from ..cohere.modeling_cohere import apply_rotary_pos_emb from ..llama.modeling_llama import LlamaAttention, eager_attention_forward from ..superglue.image_processing_pil_superglue import SuperGlueImageProcessorPil -from ..superglue.image_processing_superglue import SuperGlueImageProcessor +from ..superglue.image_processing_superglue import SuperGlueImageProcessor, SuperGlueImageProcessorKwargs from ..superpoint import SuperPointConfig @@ -154,13 +154,8 @@ class LightGlueKeypointMatchingOutput(ModelOutput): attentions: tuple[torch.FloatTensor] | None = None -class LightGlueImageProcessorKwargs(ImagesKwargs, total=False): - r""" - do_grayscale (`bool`, *optional*, defaults to `self.do_grayscale`): - Whether to convert the image to grayscale. Can be overridden by `do_grayscale` in the `preprocess` method. 
- """ - - do_grayscale: bool +class LightGlueImageProcessorKwargs(SuperGlueImageProcessorKwargs): + pass class LightGlueImageProcessor(SuperGlueImageProcessor): diff --git a/src/transformers/models/llava_onevision/modular_llava_onevision.py b/src/transformers/models/llava_onevision/modular_llava_onevision.py index a3634aa17cba..f44a4612cdc2 100644 --- a/src/transformers/models/llava_onevision/modular_llava_onevision.py +++ b/src/transformers/models/llava_onevision/modular_llava_onevision.py @@ -34,7 +34,7 @@ ) from ...modeling_flash_attention_utils import FlashAttentionKwargs from ...modeling_outputs import BaseModelOutputWithPooling -from ...processing_utils import ImagesKwargs, Unpack +from ...processing_utils import Unpack from ...utils import TensorType, auto_docstring, logging from ...utils.generic import can_return_tuple, merge_with_config_defaults from ..llava_next.image_processing_llava_next import LlavaNextImageProcessor, LlavaNextImageProcessorKwargs @@ -217,17 +217,6 @@ def _preprocess( ) -class LlavaOnevisionImageProcessorKwargs(ImagesKwargs, total=False): - r""" - image_grid_pinpoints (`list[list[int]]`, *optional*): - A list of possible resolutions to use for processing high resolution images. The best resolution is selected - based on the original size of the image. Can be overridden by `image_grid_pinpoints` in the `preprocess` - method. - """ - - image_grid_pinpoints: list[list[int]] - - class LlavaOnevisionImageProcessorPil(LlavaNextImageProcessorPil): resample = PILImageResampling.BICUBIC image_mean = OPENAI_CLIP_MEAN diff --git a/src/transformers/models/mask2former/modular_mask2former.py b/src/transformers/models/mask2former/modular_mask2former.py index 87f2b834991f..089baffe5df7 100644 --- a/src/transformers/models/mask2former/modular_mask2former.py +++ b/src/transformers/models/mask2former/modular_mask2former.py @@ -15,8 +15,6 @@ import torch from torch import nn -from ...image_utils import SizeDict -from ...processing_utils import ImagesKwargs from ...utils import ( TensorType, logging, @@ -35,32 +33,6 @@ logger = logging.get_logger(__name__) -class Mask2FormerImageProcessorKwargs(ImagesKwargs, total=False): - r""" - ignore_index (`int`, *optional*): - Label to be assigned to background pixels in segmentation maps. If provided, segmentation map pixels - denoted with 0 (background) will be replaced with `ignore_index`. - do_reduce_labels (`bool`, *optional*, defaults to `False`): - Whether or not to decrement all label values of segmentation maps by 1. Usually used for datasets where 0 - is used for background, and background itself is not included in all classes of a dataset (e.g. ADE20k). - The background label will be replaced by `ignore_index`. - num_labels (`int`, *optional*): - The number of labels in the segmentation map. - size_divisor (`int`, *optional*, defaults to `32`): - Some backbones need images divisible by a certain number. If not passed, it defaults to the value used in - Swin Transformer. - pad_size (`SizeDict`, *optional*): - The size to pad the images to. Must be larger than any image size provided for preprocessing. If `pad_size` - is not provided, images will be padded to the largest height and width in the batch. 
- """ - - ignore_index: int | None - do_reduce_labels: bool - num_labels: int | None - size_divisor: int - pad_size: SizeDict | None - - class Mask2FormerImageProcessor(MaskFormerImageProcessor): def post_process_semantic_segmentation( self, outputs, target_sizes: list[tuple[int, int]] | None = None diff --git a/src/transformers/models/paddleocr_vl/modular_paddleocr_vl.py b/src/transformers/models/paddleocr_vl/modular_paddleocr_vl.py index 20a897059a4e..02895d6e2576 100644 --- a/src/transformers/models/paddleocr_vl/modular_paddleocr_vl.py +++ b/src/transformers/models/paddleocr_vl/modular_paddleocr_vl.py @@ -38,9 +38,8 @@ from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPast, BaseModelOutputWithPooling from ...modeling_utils import PreTrainedModel from ...models.qwen2_vl.image_processing_pil_qwen2_vl import Qwen2VLImageProcessorPil -from ...models.qwen2_vl.image_processing_qwen2_vl import Qwen2VLImageProcessor +from ...models.qwen2_vl.image_processing_qwen2_vl import Qwen2VLImageProcessor, Qwen2VLImageProcessorKwargs from ...processing_utils import ( - ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack, @@ -123,7 +122,7 @@ def smart_resize( return h_bar, w_bar -class PaddleOCRVLImageProcessorKwargs(ImagesKwargs, total=False): +class PaddleOCRVLImageProcessorKwargs(Qwen2VLImageProcessorKwargs): r""" patch_size (`int`, *optional*, defaults to 14): The spatial patch size of the vision encoder. @@ -133,12 +132,6 @@ class PaddleOCRVLImageProcessorKwargs(ImagesKwargs, total=False): The merge size of the vision encoder to llm encoder. """ - min_pixels: int - max_pixels: int - patch_size: int - temporal_patch_size: int - merge_size: int - class PaddleOCRVLImageProcessorPil(Qwen2VLImageProcessorPil): size = {"shortest_edge": 384 * 384, "longest_edge": 1536 * 1536} diff --git a/src/transformers/models/rt_detr/modular_rt_detr.py b/src/transformers/models/rt_detr/modular_rt_detr.py index 97136541d6ec..cd4e8faf3fc2 100644 --- a/src/transformers/models/rt_detr/modular_rt_detr.py +++ b/src/transformers/models/rt_detr/modular_rt_detr.py @@ -426,20 +426,6 @@ def post_process_panoptic_segmentation(self): raise NotImplementedError("Panoptic segmentation post-processing is not implemented for RT-DETR yet.") -class RTDetrImageProcessorKwargs(ImagesKwargs, total=False): - r""" - format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): - Data format of the annotations. One of "coco_detection" or "coco_panoptic". - do_convert_annotations (`bool`, *optional*, defaults to `True`): - Controls whether to convert the annotations to the format expected by the RT_DETR model. Converts the - bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`. - Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method. 
- """ - - format: str | AnnotationFormat - do_convert_annotations: bool - - @requires(backends=("torch",)) class RTDetrImageProcessorPil(DetrImageProcessorPil): resample = PILImageResampling.BILINEAR diff --git a/src/transformers/models/segformer/modular_segformer.py b/src/transformers/models/segformer/modular_segformer.py index d7f339ea6e42..414dc58e8c52 100644 --- a/src/transformers/models/segformer/modular_segformer.py +++ b/src/transformers/models/segformer/modular_segformer.py @@ -31,22 +31,10 @@ PILImageResampling, SizeDict, ) -from ...processing_utils import ImagesKwargs from ...utils import TensorType from ...utils.import_utils import requires -class SegformerImageProcessorKwargs(ImagesKwargs, total=False): - r""" - do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`): - Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0 - is used for background, and background itself is not included in all classes of a dataset (e.g. - ADE20k). The background label will be replaced by 255. - """ - - do_reduce_labels: bool - - class SegformerImageProcessor(BeitImageProcessor): resample = PILImageResampling.BILINEAR image_mean = IMAGENET_DEFAULT_MEAN diff --git a/src/transformers/models/smolvlm/modular_smolvlm.py b/src/transformers/models/smolvlm/modular_smolvlm.py index 9c572cc9d877..cf91863c56a7 100644 --- a/src/transformers/models/smolvlm/modular_smolvlm.py +++ b/src/transformers/models/smolvlm/modular_smolvlm.py @@ -22,7 +22,7 @@ from ...generation import GenerationConfig from ...modeling_flash_attention_utils import FlashAttentionKwargs from ...modeling_outputs import BaseModelOutputWithPooling -from ...processing_utils import ImagesKwargs, Unpack +from ...processing_utils import Unpack from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging, torch_compilable_check from ..idefics3.configuration_idefics3 import Idefics3Config, Idefics3VisionConfig from ..idefics3.image_processing_idefics3 import Idefics3ImageProcessor @@ -91,22 +91,6 @@ class SmolVLMConfig(Idefics3Config): model_type = "smolvlm" -class SmolVLMImageProcessorKwargs(ImagesKwargs, total=False): - """ - do_image_splitting (`bool`, *optional*, defaults to `True`): - Whether to split the image into sub-images concatenated with the original image. They are split into patches - such that each patch has a size of `max_image_size["height"]` x `max_image_size["width"]`. - max_image_size (`Dict`, *optional*, defaults to `{"longest_edge": 364}`): - Maximum resolution of the patches of images accepted by the model. This is a dictionary containing the key "longest_edge". - return_row_col_info (`bool`, *optional*, defaults to `False`): - Whether to return the row and column information of the images. 
- """ - - do_image_splitting: bool - max_image_size: dict[str, int] - return_row_col_info: bool - - class SmolVLMImageProcessor(Idefics3ImageProcessor): pass diff --git a/src/transformers/models/video_llama_3/modular_video_llama_3.py b/src/transformers/models/video_llama_3/modular_video_llama_3.py index 4eef74580c87..c4a9e40bc8f0 100644 --- a/src/transformers/models/video_llama_3/modular_video_llama_3.py +++ b/src/transformers/models/video_llama_3/modular_video_llama_3.py @@ -37,7 +37,7 @@ ) from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ModelOutput from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel -from ...processing_utils import ImagesKwargs, Unpack +from ...processing_utils import Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...utils import ( TensorType, @@ -55,7 +55,7 @@ from ..auto import CONFIG_MAPPING, AutoConfig from ..auto.modeling_auto import AutoModel from ..qwen2_vl.image_processing_pil_qwen2_vl import Qwen2VLImageProcessorPil -from ..qwen2_vl.image_processing_qwen2_vl import Qwen2VLImageProcessor, smart_resize +from ..qwen2_vl.image_processing_qwen2_vl import Qwen2VLImageProcessor, Qwen2VLImageProcessorKwargs, smart_resize from ..qwen2_vl.modeling_qwen2_vl import ( Qwen2VLForConditionalGeneration, Qwen2VLModel, @@ -1107,25 +1107,8 @@ def model_input_names(self): raise AttributeError("VideoLlama doesn't need to override it") -class VideoLlama3ImageProcessorKwargs(ImagesKwargs, total=False): - r""" - min_pixels (`int`, *optional*, defaults to `56 * 56`): - The min pixels of the image to resize the image. - max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`): - The max pixels of the image to resize the image. - patch_size (`int`, *optional*, defaults to 14): - The spatial patch size of the vision encoder. - temporal_patch_size (`int`, *optional*, defaults to 2): - The temporal patch size of the vision encoder. - merge_size (`int`, *optional*, defaults to 2): - The merge size of the vision encoder to llm encoder. - """ - - min_pixels: int - max_pixels: int - patch_size: int - temporal_patch_size: int - merge_size: int +class VideoLlama3ImageProcessorKwargs(Qwen2VLImageProcessorKwargs): + pass class VideoLlama3ImageProcessorPil(Qwen2VLImageProcessorPil): From ef6419a6dd96620af5c58db50262e86287d1b6c8 Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Tue, 21 Apr 2026 09:42:58 +0900 Subject: [PATCH 3/9] forgot docstring --- .../models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py b/src/transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py index 0e9f27d5c41d..ad47bc0508a3 100644 --- a/src/transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +++ b/src/transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py @@ -1221,7 +1221,14 @@ def forward( class Ernie4_5_VLMoeImageProcessorKwargs(Glm4vImageProcessorKwargs): - pass + r""" + patch_size (`int`, *optional*, defaults to 14): + The spatial patch size of the vision encoder. + temporal_patch_size (`int`, *optional*): + The temporal patch size of the vision encoder. Unused in the image processor, only used for videos. + merge_size (`int`, *optional*, defaults to 2): + The merge size of the vision encoder to llm encoder. 
+ """ class Ernie4_5_VLMoeImageProcessorPil(Glm4vImageProcessorPil): From 1c2343b2b5a57a077e65dba6094c1f2f365c893d Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Tue, 21 Apr 2026 09:59:45 +0900 Subject: [PATCH 4/9] oups --- utils/modular_model_converter.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/utils/modular_model_converter.py b/utils/modular_model_converter.py index 5fd453816f54..d5dc7dfe23b6 100644 --- a/utils/modular_model_converter.py +++ b/utils/modular_model_converter.py @@ -1755,13 +1755,7 @@ class node based on the inherited classes if needed. Also returns any new import relative_dependency_order = mapper.compute_relative_order(all_dependencies_to_add) nodes_to_add = { - dep: ( - relative_dependency_order[dep], - # If this dependency is explicitly defined in the modular, prefer the modular's version. - # This prevents a renamed parent class from overriding a modular-defined class of the same name. - modular_mapper.global_nodes[dep] if dep in modular_mapper.classes else mapper.global_nodes[dep], - ) - for dep in all_dependencies_to_add + dep: (relative_dependency_order[dep], mapper.global_nodes[dep]) for dep in all_dependencies_to_add } # No transformers (modeling file) super class, just check functions and assignments dependencies @@ -1779,7 +1773,9 @@ class node based on the inherited classes if needed. Also returns any new import relative_dependency_order = modular_mapper.compute_relative_order(all_dependencies_to_add) nodes_to_add = { - dep: (relative_dependency_order[dep], mapper.global_nodes[dep]) for dep in all_dependencies_to_add + dep: (relative_dependency_order[dep], modular_mapper.global_nodes[dep]) + for dep in all_dependencies_to_add + if dep not in file_to_update } # Add the class node itself to the nodes to add From 56c04dff404b45194920d012a5671ea6cfbb13d0 Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Tue, 21 Apr 2026 10:02:57 +0900 Subject: [PATCH 5/9] apply modular converter --- src/transformers/dependency_versions_table.py | 1 + .../image_processing_pil_conditional_detr.py | 18 ++--------- .../image_processing_pil_deepseek_vl.py | 13 ++------ ...image_processing_pil_deepseek_vl_hybrid.py | 30 ++----------------- .../image_processing_pil_deformable_detr.py | 18 ++--------- .../image_processing_pil_efficientloftr.py | 12 ++------ .../image_processing_pil_ernie4_5_vl_moe.py | 18 ++--------- .../glm46v/image_processing_pil_glm46v.py | 18 ++--------- .../image_processing_pil_glm_image.py | 25 ++-------------- .../image_processing_pil_grounding_dino.py | 18 ++--------- .../image_processing_pil_lightglue.py | 12 ++------ .../image_processing_pil_llava_onevision.py | 14 ++------- .../image_processing_pil_mask2former.py | 29 ++---------------- .../image_processing_pil_paddleocr_vl.py | 20 ++----------- .../rt_detr/image_processing_pil_rt_detr.py | 18 ++--------- .../image_processing_pil_segformer.py | 14 ++------- .../smolvlm/image_processing_pil_smolvlm.py | 19 ++---------- .../image_processing_pil_video_llama_3.py | 24 ++------------- .../yolos/image_processing_pil_yolos.py | 19 ++---------- 19 files changed, 37 insertions(+), 303 deletions(-) diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index b08aa558d795..0456904dd3d5 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -56,6 +56,7 @@ "rjieba": "rjieba", "rouge-score": "rouge-score!=0.0.7,!=0.0.8,!=0.1,!=0.1.1", "ruff": "ruff==0.14.10", + 
"transformers-mlinter": "transformers-mlinter @ git+https://github.com/huggingface/transformers-mlinter@b9d319ce264c106f97a959d926ef42bc3c0ea4d1", "ty": "ty==0.0.20", "sacrebleu": "sacrebleu>=1.4.12,<2.0.0", "sacremoses": "sacremoses", diff --git a/src/transformers/models/conditional_detr/image_processing_pil_conditional_detr.py b/src/transformers/models/conditional_detr/image_processing_pil_conditional_detr.py index 359c4c706f7c..30b6e2752273 100644 --- a/src/transformers/models/conditional_detr/image_processing_pil_conditional_detr.py +++ b/src/transformers/models/conditional_detr/image_processing_pil_conditional_detr.py @@ -48,9 +48,10 @@ get_max_height_width, validate_annotations, ) -from ...processing_utils import ImagesKwargs, Unpack +from ...processing_utils import Unpack from ...utils import TensorType, auto_docstring, is_torch_available, is_vision_available, logging, requires_backends from ...utils.import_utils import requires +from .image_processing_conditional_detr import ConditionalDetrImageProcessorKwargs if is_vision_available(): @@ -61,21 +62,6 @@ logger = logging.get_logger(__name__) - -class ConditionalDetrImageProcessorKwargs(ImagesKwargs, total=False): - r""" - format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): - Data format of the annotations. One of "coco_detection" or "coco_panoptic". - do_convert_annotations (`bool`, *optional*, defaults to `True`): - Controls whether to convert the annotations to the format expected by the CONDITIONAL_DETR model. Converts the - bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`. - Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method. - """ - - format: str | AnnotationFormat - do_convert_annotations: bool - - SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC) diff --git a/src/transformers/models/deepseek_vl/image_processing_pil_deepseek_vl.py b/src/transformers/models/deepseek_vl/image_processing_pil_deepseek_vl.py index e868830b0220..d29296535277 100644 --- a/src/transformers/models/deepseek_vl/image_processing_pil_deepseek_vl.py +++ b/src/transformers/models/deepseek_vl/image_processing_pil_deepseek_vl.py @@ -32,18 +32,9 @@ PILImageResampling, SizeDict, ) -from ...processing_utils import ImagesKwargs, Unpack +from ...processing_utils import Unpack from ...utils import TensorType, auto_docstring - - -class DeepseekVLImageProcessorKwargs(ImagesKwargs, total=False): - r""" - min_size (`int`, *optional*, defaults to 14): - The minimum allowed size for the resized image. Ensures that neither the height nor width - falls below this value after resizing. - """ - - min_size: int +from .image_processing_deepseek_vl import DeepseekVLImageProcessorKwargs @auto_docstring diff --git a/src/transformers/models/deepseek_vl_hybrid/image_processing_pil_deepseek_vl_hybrid.py b/src/transformers/models/deepseek_vl_hybrid/image_processing_pil_deepseek_vl_hybrid.py index 55573c35c423..c7ef92dce05f 100644 --- a/src/transformers/models/deepseek_vl_hybrid/image_processing_pil_deepseek_vl_hybrid.py +++ b/src/transformers/models/deepseek_vl_hybrid/image_processing_pil_deepseek_vl_hybrid.py @@ -19,7 +19,6 @@ # limitations under the License. 
from collections.abc import Iterable -from typing import Union import numpy as np @@ -34,34 +33,9 @@ PILImageResampling, SizeDict, ) -from ...processing_utils import ImagesKwargs, Unpack +from ...processing_utils import Unpack from ...utils import TensorType, auto_docstring - - -class DeepseekVLHybridImageProcessorKwargs(ImagesKwargs, total=False): - r""" - min_size (`int`, *optional*, defaults to 14): - The minimum allowed size for the resized image. Ensures that neither the height nor width - falls below this value after resizing. - high_res_size (`dict`, *optional*, defaults to `{"height": 1024, "width": 1024}`): - Size of the high resolution output image after resizing. Can be overridden by the `high_res_size` parameter in the `preprocess` - method. - high_res_resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`): - Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be - overridden by the `high_res_resample` parameter in the `preprocess` method. - high_res_image_mean (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_MEAN`): - Mean to use if normalizing the high resolution image. This is a float or list of floats the length of the number of - channels in the image. Can be overridden by the `high_res_image_mean` parameter in the `preprocess` method. - high_res_image_std (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_STD`): - Standard deviation to use if normalizing the high resolution image. This is a float or list of floats the length of the - number of channels in the image. Can be overridden by the `high_res_image_std` parameter in the `preprocess` method. - """ - - min_size: int - high_res_size: dict - high_res_resample: Union["PILImageResampling", int] - high_res_image_mean: float | list[float] | tuple[float, ...] - high_res_image_std: float | list[float] | tuple[float, ...] +from .image_processing_deepseek_vl_hybrid import DeepseekVLHybridImageProcessorKwargs @auto_docstring diff --git a/src/transformers/models/deformable_detr/image_processing_pil_deformable_detr.py b/src/transformers/models/deformable_detr/image_processing_pil_deformable_detr.py index fcd95fa4647f..dd66876deca4 100644 --- a/src/transformers/models/deformable_detr/image_processing_pil_deformable_detr.py +++ b/src/transformers/models/deformable_detr/image_processing_pil_deformable_detr.py @@ -47,9 +47,10 @@ get_max_height_width, validate_annotations, ) -from ...processing_utils import ImagesKwargs, Unpack +from ...processing_utils import Unpack from ...utils import TensorType, auto_docstring, is_torch_available, is_vision_available from ...utils.import_utils import requires, requires_backends +from .image_processing_deformable_detr import DeformableDetrImageProcessorKwargs if is_vision_available(): @@ -57,21 +58,6 @@ if is_torch_available(): import torch - -class DeformableDetrImageProcessorKwargs(ImagesKwargs, total=False): - r""" - format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): - Data format of the annotations. One of "coco_detection" or "coco_panoptic". - do_convert_annotations (`bool`, *optional*, defaults to `True`): - Controls whether to convert the annotations to the format expected by the DEFORMABLE_DETR model. Converts the - bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`. - Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method. 
- """ - - format: str | AnnotationFormat - do_convert_annotations: bool - - SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC) diff --git a/src/transformers/models/efficientloftr/image_processing_pil_efficientloftr.py b/src/transformers/models/efficientloftr/image_processing_pil_efficientloftr.py index 5f467c56dd4f..66f7314143f3 100644 --- a/src/transformers/models/efficientloftr/image_processing_pil_efficientloftr.py +++ b/src/transformers/models/efficientloftr/image_processing_pil_efficientloftr.py @@ -21,9 +21,10 @@ is_valid_image, to_numpy_array, ) -from ...processing_utils import ImagesKwargs, Unpack +from ...processing_utils import Unpack from ...utils import TensorType, auto_docstring, is_torch_available from ...utils.import_utils import requires +from .image_processing_efficientloftr import EfficientLoFTRImageProcessorKwargs if TYPE_CHECKING: @@ -32,15 +33,6 @@ import torch -class EfficientLoFTRImageProcessorKwargs(ImagesKwargs, total=False): - r""" - do_grayscale (`bool`, *optional*, defaults to `self.do_grayscale`): - Whether to convert the image to grayscale. Can be overridden by `do_grayscale` in the `preprocess` method. - """ - - do_grayscale: bool - - def is_grayscale(image: np.ndarray): if image.shape[0] == 1: return True diff --git a/src/transformers/models/ernie4_5_vl_moe/image_processing_pil_ernie4_5_vl_moe.py b/src/transformers/models/ernie4_5_vl_moe/image_processing_pil_ernie4_5_vl_moe.py index 7f372c3af02d..8aed9c816627 100644 --- a/src/transformers/models/ernie4_5_vl_moe/image_processing_pil_ernie4_5_vl_moe.py +++ b/src/transformers/models/ernie4_5_vl_moe/image_processing_pil_ernie4_5_vl_moe.py @@ -25,28 +25,14 @@ from ...image_processing_backends import PilBackend from ...image_processing_utils import BatchFeature from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, ImageInput, PILImageResampling, SizeDict -from ...processing_utils import ImagesKwargs, Unpack +from ...processing_utils import Unpack from ...utils import TensorType, auto_docstring, logging +from .image_processing_ernie4_5_vl_moe import Ernie4_5_VLMoeImageProcessorKwargs logger = logging.get_logger(__name__) -class Ernie4_5_VLMoeImageProcessorKwargs(ImagesKwargs, total=False): - r""" - patch_size (`int`, *optional*, defaults to 14): - The spatial patch size of the vision encoder. - temporal_patch_size (`int`, *optional*): - The temporal patch size of the vision encoder. Unused in the image processor, only used for videos. - merge_size (`int`, *optional*, defaults to 2): - The merge size of the vision encoder to llm encoder. 
- """ - - patch_size: int - temporal_patch_size: int - merge_size: int - - def smart_resize( height: int, width: int, factor: int = 28, min_pixels: int = 56 * 56, max_pixels: int = 14 * 14 * 4 * 1280 ): diff --git a/src/transformers/models/glm46v/image_processing_pil_glm46v.py b/src/transformers/models/glm46v/image_processing_pil_glm46v.py index 5601e732c2b3..934988f738c8 100644 --- a/src/transformers/models/glm46v/image_processing_pil_glm46v.py +++ b/src/transformers/models/glm46v/image_processing_pil_glm46v.py @@ -26,23 +26,9 @@ from ...image_processing_backends import PilBackend from ...image_processing_utils import BatchFeature from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, ImageInput, PILImageResampling, SizeDict -from ...processing_utils import ImagesKwargs, Unpack +from ...processing_utils import Unpack from ...utils import TensorType, auto_docstring - - -class Glm46VImageProcessorKwargs(ImagesKwargs, total=False): - """ - patch_size (`int`, *optional*, defaults to 14): - The spatial patch size of the vision encoder. - temporal_patch_size (`int`, *optional*, defaults to 2): - The temporal patch size of the vision encoder. - merge_size (`int`, *optional*, defaults to 2): - The merge size of the vision encoder to llm encoder. - """ - - patch_size: int - temporal_patch_size: int - merge_size: int +from .image_processing_glm46v import Glm46VImageProcessorKwargs # Adapted from transformers.models.glm46v.image_processing_glm46v.smart_resize diff --git a/src/transformers/models/glm_image/image_processing_pil_glm_image.py b/src/transformers/models/glm_image/image_processing_pil_glm_image.py index 2dde18ef2066..0aaf95a9aaea 100644 --- a/src/transformers/models/glm_image/image_processing_pil_glm_image.py +++ b/src/transformers/models/glm_image/image_processing_pil_glm_image.py @@ -26,30 +26,9 @@ from ...feature_extraction_utils import BatchFeature from ...image_processing_backends import PilBackend from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, ImageInput, PILImageResampling, SizeDict -from ...processing_utils import ImagesKwargs, Unpack +from ...processing_utils import Unpack from ...utils import TensorType, auto_docstring - - -# Adapted from transformers.models.glm_image.image_processing_glm_image.GlmImageImageProcessorKwargs -class GlmImageImageProcessorKwargs(ImagesKwargs, total=False): - r""" - min_pixels (`int`, *optional*, defaults to `56 * 56`): - The min pixels of the image to resize the image. - max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`): - The max pixels of the image to resize the image. - patch_size (`int`, *optional*, defaults to 14): - The spatial patch size of the vision encoder. - temporal_patch_size (`int`, *optional*, defaults to 2): - The temporal patch size of the vision encoder. - merge_size (`int`, *optional*, defaults to 2): - The merge size of the vision encoder to llm encoder. 
- """ - - min_pixels: int - max_pixels: int - patch_size: int - temporal_patch_size: int - merge_size: int +from .image_processing_glm_image import GlmImageImageProcessorKwargs def smart_resize( diff --git a/src/transformers/models/grounding_dino/image_processing_pil_grounding_dino.py b/src/transformers/models/grounding_dino/image_processing_pil_grounding_dino.py index 31c59e5f3930..fbdbef4110b4 100644 --- a/src/transformers/models/grounding_dino/image_processing_pil_grounding_dino.py +++ b/src/transformers/models/grounding_dino/image_processing_pil_grounding_dino.py @@ -53,9 +53,10 @@ get_max_height_width, validate_annotations, ) -from ...processing_utils import ImagesKwargs, Unpack +from ...processing_utils import Unpack from ...utils import TensorType, auto_docstring, is_torch_available, is_vision_available, requires_backends from ...utils.import_utils import requires +from .image_processing_grounding_dino import GroundingDinoImageProcessorKwargs if TYPE_CHECKING: @@ -67,21 +68,6 @@ if is_torch_available(): import torch - -class GroundingDinoImageProcessorKwargs(ImagesKwargs, total=False): - r""" - format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): - Data format of the annotations. One of "coco_detection" or "coco_panoptic". - do_convert_annotations (`bool`, *optional*, defaults to `True`): - Controls whether to convert the annotations to the format expected by the GROUNDING_DINO model. Converts the - bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`. - Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method. - """ - - format: str | AnnotationFormat - do_convert_annotations: bool - - SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC) diff --git a/src/transformers/models/lightglue/image_processing_pil_lightglue.py b/src/transformers/models/lightglue/image_processing_pil_lightglue.py index 9f43fe1bbc7a..6283a2e1a2c5 100644 --- a/src/transformers/models/lightglue/image_processing_pil_lightglue.py +++ b/src/transformers/models/lightglue/image_processing_pil_lightglue.py @@ -35,9 +35,10 @@ is_valid_image, to_numpy_array, ) -from ...processing_utils import ImagesKwargs, Unpack +from ...processing_utils import Unpack from ...utils import TensorType, auto_docstring, is_torch_available from ...utils.import_utils import requires +from .image_processing_lightglue import LightGlueImageProcessorKwargs if TYPE_CHECKING: @@ -46,15 +47,6 @@ import torch -class LightGlueImageProcessorKwargs(ImagesKwargs, total=False): - r""" - do_grayscale (`bool`, *optional*, defaults to `self.do_grayscale`): - Whether to convert the image to grayscale. Can be overridden by `do_grayscale` in the `preprocess` method. 
- """ - - do_grayscale: bool - - def is_grayscale(image: np.ndarray): if image.shape[0] == 1: return True diff --git a/src/transformers/models/llava_onevision/image_processing_pil_llava_onevision.py b/src/transformers/models/llava_onevision/image_processing_pil_llava_onevision.py index 23534a65d70f..b894b72025b9 100644 --- a/src/transformers/models/llava_onevision/image_processing_pil_llava_onevision.py +++ b/src/transformers/models/llava_onevision/image_processing_pil_llava_onevision.py @@ -32,19 +32,9 @@ PILImageResampling, SizeDict, ) -from ...processing_utils import ImagesKwargs, Unpack +from ...processing_utils import Unpack from ...utils import TensorType, auto_docstring - - -class LlavaOnevisionImageProcessorKwargs(ImagesKwargs, total=False): - r""" - image_grid_pinpoints (`list[list[int]]`, *optional*): - A list of possible resolutions to use for processing high resolution images. The best resolution is selected - based on the original size of the image. Can be overridden by `image_grid_pinpoints` in the `preprocess` - method. - """ - - image_grid_pinpoints: list[list[int]] +from .image_processing_llava_onevision import LlavaOnevisionImageProcessorKwargs @auto_docstring diff --git a/src/transformers/models/mask2former/image_processing_pil_mask2former.py b/src/transformers/models/mask2former/image_processing_pil_mask2former.py index 8358a3601bed..2f13d1084ffa 100644 --- a/src/transformers/models/mask2former/image_processing_pil_mask2former.py +++ b/src/transformers/models/mask2former/image_processing_pil_mask2former.py @@ -39,9 +39,10 @@ get_image_size_for_max_height_width, get_max_height_width, ) -from ...processing_utils import ImagesKwargs, Unpack +from ...processing_utils import Unpack from ...utils import TensorType, auto_docstring, is_torch_available, logging, requires_backends from ...utils.import_utils import requires +from .image_processing_mask2former import Mask2FormerImageProcessorKwargs if is_torch_available(): @@ -51,32 +52,6 @@ logger = logging.get_logger(__name__) -class Mask2FormerImageProcessorKwargs(ImagesKwargs, total=False): - r""" - ignore_index (`int`, *optional*): - Label to be assigned to background pixels in segmentation maps. If provided, segmentation map pixels - denoted with 0 (background) will be replaced with `ignore_index`. - do_reduce_labels (`bool`, *optional*, defaults to `False`): - Whether or not to decrement all label values of segmentation maps by 1. Usually used for datasets where 0 - is used for background, and background itself is not included in all classes of a dataset (e.g. ADE20k). - The background label will be replaced by `ignore_index`. - num_labels (`int`, *optional*): - The number of labels in the segmentation map. - size_divisor (`int`, *optional*, defaults to `32`): - Some backbones need images divisible by a certain number. If not passed, it defaults to the value used in - Swin Transformer. - pad_size (`SizeDict`, *optional*): - The size to pad the images to. Must be larger than any image size provided for preprocessing. If `pad_size` - is not provided, images will be padded to the largest height and width in the batch. 
- """ - - ignore_index: int | None - do_reduce_labels: bool - num_labels: int | None - size_divisor: int - pad_size: SizeDict | None - - def convert_segmentation_map_to_binary_masks( segmentation_map: np.ndarray, instance_id_to_semantic_id: dict[int, int] | None = None, diff --git a/src/transformers/models/paddleocr_vl/image_processing_pil_paddleocr_vl.py b/src/transformers/models/paddleocr_vl/image_processing_pil_paddleocr_vl.py index ac639892640f..560b7869ddb9 100644 --- a/src/transformers/models/paddleocr_vl/image_processing_pil_paddleocr_vl.py +++ b/src/transformers/models/paddleocr_vl/image_processing_pil_paddleocr_vl.py @@ -31,25 +31,9 @@ from ...image_processing_backends import PilBackend from ...image_processing_utils import BatchFeature from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, ImageInput, PILImageResampling, SizeDict -from ...processing_utils import ImagesKwargs, Unpack +from ...processing_utils import Unpack from ...utils import TensorType, auto_docstring - - -class PaddleOCRVLImageProcessorKwargs(ImagesKwargs, total=False): - r""" - patch_size (`int`, *optional*, defaults to 14): - The spatial patch size of the vision encoder. - temporal_patch_size (`int`, *optional*, defaults to 1): - The temporal patch size of the vision encoder. - merge_size (`int`, *optional*, defaults to 2): - The merge size of the vision encoder to llm encoder. - """ - - min_pixels: int - max_pixels: int - patch_size: int - temporal_patch_size: int - merge_size: int +from .image_processing_paddleocr_vl import PaddleOCRVLImageProcessorKwargs def smart_resize( diff --git a/src/transformers/models/rt_detr/image_processing_pil_rt_detr.py b/src/transformers/models/rt_detr/image_processing_pil_rt_detr.py index 1fe55d067653..606b5640602c 100644 --- a/src/transformers/models/rt_detr/image_processing_pil_rt_detr.py +++ b/src/transformers/models/rt_detr/image_processing_pil_rt_detr.py @@ -46,29 +46,15 @@ get_max_height_width, validate_annotations, ) -from ...processing_utils import ImagesKwargs, Unpack +from ...processing_utils import Unpack from ...utils import TensorType, auto_docstring, is_torch_available, requires_backends from ...utils.import_utils import requires +from .image_processing_rt_detr import RTDetrImageProcessorKwargs if is_torch_available(): import torch - -class RTDetrImageProcessorKwargs(ImagesKwargs, total=False): - r""" - format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): - Data format of the annotations. One of "coco_detection" or "coco_panoptic". - do_convert_annotations (`bool`, *optional*, defaults to `True`): - Controls whether to convert the annotations to the format expected by the RT_DETR model. Converts the - bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`. - Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method. 
- """ - - format: str | AnnotationFormat - do_convert_annotations: bool - - SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC) diff --git a/src/transformers/models/segformer/image_processing_pil_segformer.py b/src/transformers/models/segformer/image_processing_pil_segformer.py index f1d0bb0f627b..77514873c59a 100644 --- a/src/transformers/models/segformer/image_processing_pil_segformer.py +++ b/src/transformers/models/segformer/image_processing_pil_segformer.py @@ -31,9 +31,10 @@ PILImageResampling, SizeDict, ) -from ...processing_utils import ImagesKwargs, Unpack +from ...processing_utils import Unpack from ...utils import TensorType, auto_docstring, is_torch_available, is_torchvision_available from ...utils.import_utils import requires +from .image_processing_segformer import SegformerImageProcessorKwargs if is_torch_available(): @@ -42,17 +43,6 @@ import torchvision.transforms.v2.functional as tvF -class SegformerImageProcessorKwargs(ImagesKwargs, total=False): - r""" - do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`): - Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0 - is used for background, and background itself is not included in all classes of a dataset (e.g. - ADE20k). The background label will be replaced by 255. - """ - - do_reduce_labels: bool - - @requires(backends=("torch", "torchvision")) class SegformerImageProcessorPil(PilBackend): """PIL backend for Segformer with reduce_label support.""" diff --git a/src/transformers/models/smolvlm/image_processing_pil_smolvlm.py b/src/transformers/models/smolvlm/image_processing_pil_smolvlm.py index 3d53ed09c11f..29f3a89f3418 100644 --- a/src/transformers/models/smolvlm/image_processing_pil_smolvlm.py +++ b/src/transformers/models/smolvlm/image_processing_pil_smolvlm.py @@ -35,24 +35,9 @@ SizeDict, make_nested_list_of_images, ) -from ...processing_utils import ImagesKwargs, Unpack +from ...processing_utils import Unpack from ...utils import TensorType, auto_docstring - - -class SmolVLMImageProcessorKwargs(ImagesKwargs, total=False): - """ - do_image_splitting (`bool`, *optional*, defaults to `True`): - Whether to split the image into sub-images concatenated with the original image. They are split into patches - such that each patch has a size of `max_image_size["height"]` x `max_image_size["width"]`. - max_image_size (`Dict`, *optional*, defaults to `{"longest_edge": 364}`): - Maximum resolution of the patches of images accepted by the model. This is a dictionary containing the key "longest_edge". - return_row_col_info (`bool`, *optional*, defaults to `False`): - Whether to return the row and column information of the images. 
- """ - - do_image_splitting: bool - max_image_size: dict[str, int] - return_row_col_info: bool +from .image_processing_smolvlm import SmolVLMImageProcessorKwargs def _make_pixel_mask(image: np.ndarray, output_size: tuple[int, int]) -> np.ndarray: diff --git a/src/transformers/models/video_llama_3/image_processing_pil_video_llama_3.py b/src/transformers/models/video_llama_3/image_processing_pil_video_llama_3.py index 5272c7465b2b..46f1cbb7d25d 100644 --- a/src/transformers/models/video_llama_3/image_processing_pil_video_llama_3.py +++ b/src/transformers/models/video_llama_3/image_processing_pil_video_llama_3.py @@ -26,29 +26,9 @@ from ...feature_extraction_utils import BatchFeature from ...image_processing_backends import PilBackend from ...image_utils import IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD, ImageInput, PILImageResampling, SizeDict -from ...processing_utils import ImagesKwargs, Unpack +from ...processing_utils import Unpack from ...utils import TensorType, auto_docstring - - -class VideoLlama3ImageProcessorKwargs(ImagesKwargs, total=False): - r""" - min_pixels (`int`, *optional*, defaults to `56 * 56`): - The min pixels of the image to resize the image. - max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`): - The max pixels of the image to resize the image. - patch_size (`int`, *optional*, defaults to 14): - The spatial patch size of the vision encoder. - temporal_patch_size (`int`, *optional*, defaults to 2): - The temporal patch size of the vision encoder. - merge_size (`int`, *optional*, defaults to 2): - The merge size of the vision encoder to llm encoder. - """ - - min_pixels: int - max_pixels: int - patch_size: int - temporal_patch_size: int - merge_size: int +from .image_processing_video_llama_3 import VideoLlama3ImageProcessorKwargs def smart_resize( diff --git a/src/transformers/models/yolos/image_processing_pil_yolos.py b/src/transformers/models/yolos/image_processing_pil_yolos.py index 219348363ea3..7f5b8385d8b9 100644 --- a/src/transformers/models/yolos/image_processing_pil_yolos.py +++ b/src/transformers/models/yolos/image_processing_pil_yolos.py @@ -33,9 +33,10 @@ get_max_height_width, validate_annotations, ) -from ...processing_utils import ImagesKwargs, Unpack +from ...processing_utils import Unpack from ...utils import TensorType, auto_docstring, is_torch_available, is_vision_available, requires_backends from ...utils.import_utils import requires +from .image_processing_yolos import YolosImageProcessorKwargs if is_vision_available(): @@ -44,22 +45,6 @@ import torch from torch import nn - -# Adapted from transformers.models.yolos.image_processing_yolos.YolosImageProcessorKwargs -class YolosImageProcessorKwargs(ImagesKwargs, total=False): - r""" - format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): - Data format of the annotations. One of "coco_detection" or "coco_panoptic". - do_convert_annotations (`bool`, *optional*, defaults to `True`): - Controls whether to convert the annotations to the format expected by the YOLOS model. Converts the - bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`. - Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method. 
- """ - - format: str | AnnotationFormat - do_convert_annotations: bool - - SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC) From fc6dc66c0e382888ef5df92870cf643dcce2ba45 Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Tue, 21 Apr 2026 18:07:55 +0900 Subject: [PATCH 6/9] fix modular to always copy --- .../image_processing_pil_conditional_detr.py | 17 ++++++++-- .../image_processing_pil_deepseek_vl.py | 13 +++++-- ...image_processing_pil_deepseek_vl_hybrid.py | 30 ++++++++++++++-- .../image_processing_pil_deformable_detr.py | 17 ++++++++-- .../image_processing_pil_efficientloftr.py | 12 +++++-- .../image_processing_pil_ernie4_5_vl_moe.py | 18 ++++++++-- .../glm46v/image_processing_pil_glm46v.py | 18 ++++++++-- .../image_processing_pil_glm_image.py | 24 +++++++++++-- .../image_processing_pil_grounding_dino.py | 17 ++++++++-- .../image_processing_pil_lightglue.py | 12 +++++-- .../image_processing_pil_llava_onevision.py | 14 ++++++-- .../image_processing_pil_mask2former.py | 29 ++++++++++++++-- .../image_processing_pil_paddleocr_vl.py | 20 +++++++++-- .../rt_detr/image_processing_pil_rt_detr.py | 17 ++++++++-- .../image_processing_pil_segformer.py | 14 ++++++-- .../smolvlm/image_processing_pil_smolvlm.py | 19 +++++++++-- .../image_processing_pil_video_llama_3.py | 24 +++++++++++-- .../yolos/image_processing_pil_yolos.py | 17 ++++++++-- utils/modular_model_converter.py | 34 +++++++++++++++++++ 19 files changed, 330 insertions(+), 36 deletions(-) diff --git a/src/transformers/models/conditional_detr/image_processing_pil_conditional_detr.py b/src/transformers/models/conditional_detr/image_processing_pil_conditional_detr.py index 30b6e2752273..30740114d5f0 100644 --- a/src/transformers/models/conditional_detr/image_processing_pil_conditional_detr.py +++ b/src/transformers/models/conditional_detr/image_processing_pil_conditional_detr.py @@ -48,10 +48,9 @@ get_max_height_width, validate_annotations, ) -from ...processing_utils import Unpack +from ...processing_utils import ImagesKwargs, Unpack from ...utils import TensorType, auto_docstring, is_torch_available, is_vision_available, logging, requires_backends from ...utils.import_utils import requires -from .image_processing_conditional_detr import ConditionalDetrImageProcessorKwargs if is_vision_available(): @@ -65,6 +64,20 @@ SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC) +class ConditionalDetrImageProcessorKwargs(ImagesKwargs, total=False): + r""" + format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): + Data format of the annotations. One of "coco_detection" or "coco_panoptic". + do_convert_annotations (`bool`, *optional*, defaults to `True`): + Controls whether to convert the annotations to the format expected by the CONDITIONAL_DETR model. Converts the + bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`. + Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method. 
+ """ + + format: str | AnnotationFormat + do_convert_annotations: bool + + # inspired by https://github.com/facebookresearch/conditional_detr/blob/master/datasets/coco.py#L33 def convert_coco_poly_to_mask(segmentations, height: int, width: int) -> np.ndarray: """ diff --git a/src/transformers/models/deepseek_vl/image_processing_pil_deepseek_vl.py b/src/transformers/models/deepseek_vl/image_processing_pil_deepseek_vl.py index d29296535277..6e2a220e3fd2 100644 --- a/src/transformers/models/deepseek_vl/image_processing_pil_deepseek_vl.py +++ b/src/transformers/models/deepseek_vl/image_processing_pil_deepseek_vl.py @@ -32,9 +32,8 @@ PILImageResampling, SizeDict, ) -from ...processing_utils import Unpack +from ...processing_utils import ImagesKwargs, Unpack from ...utils import TensorType, auto_docstring -from .image_processing_deepseek_vl import DeepseekVLImageProcessorKwargs @auto_docstring @@ -162,4 +161,14 @@ def postprocess(self): raise AttributeError("Not needed for DeepseekVL") +class DeepseekVLImageProcessorKwargs(ImagesKwargs, total=False): + r""" + min_size (`int`, *optional*, defaults to 14): + The minimum allowed size for the resized image. Ensures that neither the height nor width + falls below this value after resizing. + """ + + min_size: int + + __all__ = ["DeepseekVLImageProcessorPil"] diff --git a/src/transformers/models/deepseek_vl_hybrid/image_processing_pil_deepseek_vl_hybrid.py b/src/transformers/models/deepseek_vl_hybrid/image_processing_pil_deepseek_vl_hybrid.py index c7ef92dce05f..b1ea56d48a46 100644 --- a/src/transformers/models/deepseek_vl_hybrid/image_processing_pil_deepseek_vl_hybrid.py +++ b/src/transformers/models/deepseek_vl_hybrid/image_processing_pil_deepseek_vl_hybrid.py @@ -19,6 +19,7 @@ # limitations under the License. from collections.abc import Iterable +from typing import Union import numpy as np @@ -33,9 +34,8 @@ PILImageResampling, SizeDict, ) -from ...processing_utils import Unpack +from ...processing_utils import ImagesKwargs, Unpack from ...utils import TensorType, auto_docstring -from .image_processing_deepseek_vl_hybrid import DeepseekVLHybridImageProcessorKwargs @auto_docstring @@ -232,4 +232,30 @@ def _standardize_kwargs( return kwargs +class DeepseekVLHybridImageProcessorKwargs(ImagesKwargs, total=False): + r""" + min_size (`int`, *optional*, defaults to 14): + The minimum allowed size for the resized image. Ensures that neither the height nor width + falls below this value after resizing. + high_res_size (`dict`, *optional*, defaults to `{"height": 1024, "width": 1024}`): + Size of the high resolution output image after resizing. Can be overridden by the `high_res_size` parameter in the `preprocess` + method. + high_res_resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`): + Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be + overridden by the `high_res_resample` parameter in the `preprocess` method. + high_res_image_mean (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_MEAN`): + Mean to use if normalizing the high resolution image. This is a float or list of floats the length of the number of + channels in the image. Can be overridden by the `high_res_image_mean` parameter in the `preprocess` method. + high_res_image_std (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_STD`): + Standard deviation to use if normalizing the high resolution image. This is a float or list of floats the length of the + number of channels in the image. 
Can be overridden by the `high_res_image_std` parameter in the `preprocess` method. + """ + + min_size: int + high_res_size: dict + high_res_resample: Union["PILImageResampling", int] + high_res_image_mean: float | list[float] | tuple[float, ...] + high_res_image_std: float | list[float] | tuple[float, ...] + + __all__ = ["DeepseekVLHybridImageProcessorPil"] diff --git a/src/transformers/models/deformable_detr/image_processing_pil_deformable_detr.py b/src/transformers/models/deformable_detr/image_processing_pil_deformable_detr.py index dd66876deca4..9c7ccc213910 100644 --- a/src/transformers/models/deformable_detr/image_processing_pil_deformable_detr.py +++ b/src/transformers/models/deformable_detr/image_processing_pil_deformable_detr.py @@ -47,10 +47,9 @@ get_max_height_width, validate_annotations, ) -from ...processing_utils import Unpack +from ...processing_utils import ImagesKwargs, Unpack from ...utils import TensorType, auto_docstring, is_torch_available, is_vision_available from ...utils.import_utils import requires, requires_backends -from .image_processing_deformable_detr import DeformableDetrImageProcessorKwargs if is_vision_available(): @@ -61,6 +60,20 @@ SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC) +class DeformableDetrImageProcessorKwargs(ImagesKwargs, total=False): + r""" + format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): + Data format of the annotations. One of "coco_detection" or "coco_panoptic". + do_convert_annotations (`bool`, *optional*, defaults to `True`): + Controls whether to convert the annotations to the format expected by the DEFORMABLE_DETR model. Converts the + bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`. + Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method. + """ + + format: str | AnnotationFormat + do_convert_annotations: bool + + # inspired by https://github.com/facebookresearch/deformable_detr/blob/master/datasets/coco.py#L33 def convert_coco_poly_to_mask(segmentations, height: int, width: int) -> np.ndarray: """ diff --git a/src/transformers/models/efficientloftr/image_processing_pil_efficientloftr.py b/src/transformers/models/efficientloftr/image_processing_pil_efficientloftr.py index 66f7314143f3..7c42d75f2baa 100644 --- a/src/transformers/models/efficientloftr/image_processing_pil_efficientloftr.py +++ b/src/transformers/models/efficientloftr/image_processing_pil_efficientloftr.py @@ -21,10 +21,9 @@ is_valid_image, to_numpy_array, ) -from ...processing_utils import Unpack +from ...processing_utils import ImagesKwargs, Unpack from ...utils import TensorType, auto_docstring, is_torch_available from ...utils.import_utils import requires -from .image_processing_efficientloftr import EfficientLoFTRImageProcessorKwargs if TYPE_CHECKING: @@ -39,6 +38,15 @@ def is_grayscale(image: np.ndarray): return np.all(image[0, ...] == image[1, ...]) and np.all(image[1, ...] == image[2, ...]) +class EfficientLoFTRImageProcessorKwargs(ImagesKwargs, total=False): + r""" + do_grayscale (`bool`, *optional*, defaults to `self.do_grayscale`): + Whether to convert the image to grayscale. Can be overridden by `do_grayscale` in the `preprocess` method. + """ + + do_grayscale: bool + + def convert_to_grayscale(image: ImageInput) -> ImageInput: """ Converts an image to grayscale format using the NTSC formula. Only support numpy and PIL Image. 
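The kwargs classes inlined in the hunks above all follow the same pattern: a `TypedDict`-style subclass of `ImagesKwargs` declared with `total=False`, so every field is optional, which the processors then accept through an `Unpack[...]` annotation on `**kwargs`. Below is a minimal, self-contained sketch of that pattern; it uses only the standard `typing` module and illustrative names, not the actual transformers classes.

from typing import TypedDict, Unpack  # Unpack requires Python 3.11+ (or typing_extensions on older versions)


class ToyImageKwargs(TypedDict, total=False):
    # total=False makes every key optional, mirroring the *ImageProcessorKwargs classes above
    patch_size: int
    merge_size: int


def preprocess(image, **kwargs: Unpack[ToyImageKwargs]) -> dict:
    # a type checker now knows exactly which keyword arguments are accepted and their types
    return {
        "patch_size": kwargs.get("patch_size", 14),
        "merge_size": kwargs.get("merge_size", 2),
    }


print(preprocess(None, patch_size=16))  # {'patch_size': 16, 'merge_size': 2}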
diff --git a/src/transformers/models/ernie4_5_vl_moe/image_processing_pil_ernie4_5_vl_moe.py b/src/transformers/models/ernie4_5_vl_moe/image_processing_pil_ernie4_5_vl_moe.py index 8aed9c816627..4b6db850f8da 100644 --- a/src/transformers/models/ernie4_5_vl_moe/image_processing_pil_ernie4_5_vl_moe.py +++ b/src/transformers/models/ernie4_5_vl_moe/image_processing_pil_ernie4_5_vl_moe.py @@ -25,9 +25,8 @@ from ...image_processing_backends import PilBackend from ...image_processing_utils import BatchFeature from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, ImageInput, PILImageResampling, SizeDict -from ...processing_utils import Unpack +from ...processing_utils import ImagesKwargs, Unpack from ...utils import TensorType, auto_docstring, logging -from .image_processing_ernie4_5_vl_moe import Ernie4_5_VLMoeImageProcessorKwargs logger = logging.get_logger(__name__) @@ -62,6 +61,21 @@ def smart_resize( return h_bar, w_bar +class Ernie4_5_VLMoeImageProcessorKwargs(ImagesKwargs, total=False): + r""" + patch_size (`int`, *optional*, defaults to 14): + The spatial patch size of the vision encoder. + temporal_patch_size (`int`, *optional*): + The temporal patch size of the vision encoder. Unused in the image processor, only used for videos. + merge_size (`int`, *optional*, defaults to 2): + The merge size of the vision encoder to llm encoder. + """ + + patch_size: int + temporal_patch_size: int + merge_size: int + + @auto_docstring class Ernie4_5_VLMoeImageProcessorPil(PilBackend): do_resize = True diff --git a/src/transformers/models/glm46v/image_processing_pil_glm46v.py b/src/transformers/models/glm46v/image_processing_pil_glm46v.py index 934988f738c8..5070535f6ecf 100644 --- a/src/transformers/models/glm46v/image_processing_pil_glm46v.py +++ b/src/transformers/models/glm46v/image_processing_pil_glm46v.py @@ -26,9 +26,8 @@ from ...image_processing_backends import PilBackend from ...image_processing_utils import BatchFeature from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, ImageInput, PILImageResampling, SizeDict -from ...processing_utils import Unpack +from ...processing_utils import ImagesKwargs, Unpack from ...utils import TensorType, auto_docstring -from .image_processing_glm46v import Glm46VImageProcessorKwargs # Adapted from transformers.models.glm46v.image_processing_glm46v.smart_resize @@ -68,6 +67,21 @@ def smart_resize( return h_bar, w_bar +class Glm46VImageProcessorKwargs(ImagesKwargs, total=False): + """ + patch_size (`int`, *optional*, defaults to 14): + The spatial patch size of the vision encoder. + temporal_patch_size (`int`, *optional*, defaults to 2): + The temporal patch size of the vision encoder. + merge_size (`int`, *optional*, defaults to 2): + The merge size of the vision encoder to llm encoder. 
+ """ + + patch_size: int + temporal_patch_size: int + merge_size: int + + @auto_docstring class Glm46VImageProcessorPil(PilBackend): do_resize = True diff --git a/src/transformers/models/glm_image/image_processing_pil_glm_image.py b/src/transformers/models/glm_image/image_processing_pil_glm_image.py index 0aaf95a9aaea..7861cc32a1ae 100644 --- a/src/transformers/models/glm_image/image_processing_pil_glm_image.py +++ b/src/transformers/models/glm_image/image_processing_pil_glm_image.py @@ -26,9 +26,8 @@ from ...feature_extraction_utils import BatchFeature from ...image_processing_backends import PilBackend from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, ImageInput, PILImageResampling, SizeDict -from ...processing_utils import Unpack +from ...processing_utils import ImagesKwargs, Unpack from ...utils import TensorType, auto_docstring -from .image_processing_glm_image import GlmImageImageProcessorKwargs def smart_resize( @@ -72,6 +71,27 @@ def smart_resize( return h_bar, w_bar +class GlmImageImageProcessorKwargs(ImagesKwargs, total=False): + r""" + min_pixels (`int`, *optional*, defaults to `56 * 56`): + The min pixels of the image to resize the image. + max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`): + The max pixels of the image to resize the image. + patch_size (`int`, *optional*, defaults to 14): + The spatial patch size of the vision encoder. + temporal_patch_size (`int`, *optional*, defaults to 2): + The temporal patch size of the vision encoder. + merge_size (`int`, *optional*, defaults to 2): + The merge size of the vision encoder to llm encoder. + """ + + min_pixels: int + max_pixels: int + patch_size: int + temporal_patch_size: int + merge_size: int + + @auto_docstring class GlmImageImageProcessorPil(PilBackend): do_resize = True diff --git a/src/transformers/models/grounding_dino/image_processing_pil_grounding_dino.py b/src/transformers/models/grounding_dino/image_processing_pil_grounding_dino.py index fbdbef4110b4..c95d7cb386bd 100644 --- a/src/transformers/models/grounding_dino/image_processing_pil_grounding_dino.py +++ b/src/transformers/models/grounding_dino/image_processing_pil_grounding_dino.py @@ -53,10 +53,9 @@ get_max_height_width, validate_annotations, ) -from ...processing_utils import Unpack +from ...processing_utils import ImagesKwargs, Unpack from ...utils import TensorType, auto_docstring, is_torch_available, is_vision_available, requires_backends from ...utils.import_utils import requires -from .image_processing_grounding_dino import GroundingDinoImageProcessorKwargs if TYPE_CHECKING: @@ -71,6 +70,20 @@ SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC) +class GroundingDinoImageProcessorKwargs(ImagesKwargs, total=False): + r""" + format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): + Data format of the annotations. One of "coco_detection" or "coco_panoptic". + do_convert_annotations (`bool`, *optional*, defaults to `True`): + Controls whether to convert the annotations to the format expected by the GROUNDING_DINO model. Converts the + bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`. + Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method. 
+ """ + + format: str | AnnotationFormat + do_convert_annotations: bool + + # inspired by https://github.com/facebookresearch/grounding_dino/blob/master/datasets/coco.py#L33 def convert_coco_poly_to_mask(segmentations, height: int, width: int) -> np.ndarray: """ diff --git a/src/transformers/models/lightglue/image_processing_pil_lightglue.py b/src/transformers/models/lightglue/image_processing_pil_lightglue.py index 6283a2e1a2c5..77389f8e8da3 100644 --- a/src/transformers/models/lightglue/image_processing_pil_lightglue.py +++ b/src/transformers/models/lightglue/image_processing_pil_lightglue.py @@ -35,10 +35,9 @@ is_valid_image, to_numpy_array, ) -from ...processing_utils import Unpack +from ...processing_utils import ImagesKwargs, Unpack from ...utils import TensorType, auto_docstring, is_torch_available from ...utils.import_utils import requires -from .image_processing_lightglue import LightGlueImageProcessorKwargs if TYPE_CHECKING: @@ -53,6 +52,15 @@ def is_grayscale(image: np.ndarray): return np.all(image[0, ...] == image[1, ...]) and np.all(image[1, ...] == image[2, ...]) +class LightGlueImageProcessorKwargs(ImagesKwargs, total=False): + r""" + do_grayscale (`bool`, *optional*, defaults to `self.do_grayscale`): + Whether to convert the image to grayscale. Can be overridden by `do_grayscale` in the `preprocess` method. + """ + + do_grayscale: bool + + def convert_to_grayscale(image: ImageInput) -> ImageInput: """ Converts an image to grayscale format using the NTSC formula. Only support numpy and PIL Image. diff --git a/src/transformers/models/llava_onevision/image_processing_pil_llava_onevision.py b/src/transformers/models/llava_onevision/image_processing_pil_llava_onevision.py index b894b72025b9..96a973ead67d 100644 --- a/src/transformers/models/llava_onevision/image_processing_pil_llava_onevision.py +++ b/src/transformers/models/llava_onevision/image_processing_pil_llava_onevision.py @@ -32,9 +32,8 @@ PILImageResampling, SizeDict, ) -from ...processing_utils import Unpack +from ...processing_utils import ImagesKwargs, Unpack from ...utils import TensorType, auto_docstring -from .image_processing_llava_onevision import LlavaOnevisionImageProcessorKwargs @auto_docstring @@ -294,4 +293,15 @@ def pad_to_square( return result +class LlavaOnevisionImageProcessorKwargs(ImagesKwargs, total=False): + r""" + image_grid_pinpoints (`list[list[int]]`, *optional*): + A list of possible resolutions to use for processing high resolution images. The best resolution is selected + based on the original size of the image. Can be overridden by `image_grid_pinpoints` in the `preprocess` + method. 
+ """ + + image_grid_pinpoints: list[list[int]] + + __all__ = ["LlavaOnevisionImageProcessorPil"] diff --git a/src/transformers/models/mask2former/image_processing_pil_mask2former.py b/src/transformers/models/mask2former/image_processing_pil_mask2former.py index 2f13d1084ffa..6b27657b3677 100644 --- a/src/transformers/models/mask2former/image_processing_pil_mask2former.py +++ b/src/transformers/models/mask2former/image_processing_pil_mask2former.py @@ -39,10 +39,9 @@ get_image_size_for_max_height_width, get_max_height_width, ) -from ...processing_utils import Unpack +from ...processing_utils import ImagesKwargs, Unpack from ...utils import TensorType, auto_docstring, is_torch_available, logging, requires_backends from ...utils.import_utils import requires -from .image_processing_mask2former import Mask2FormerImageProcessorKwargs if is_torch_available(): @@ -88,6 +87,32 @@ def convert_segmentation_map_to_binary_masks( return binary_masks.astype(np.float32), labels +class Mask2FormerImageProcessorKwargs(ImagesKwargs, total=False): + r""" + ignore_index (`int`, *optional*): + Label to be assigned to background pixels in segmentation maps. If provided, segmentation map pixels + denoted with 0 (background) will be replaced with `ignore_index`. + do_reduce_labels (`bool`, *optional*, defaults to `False`): + Whether or not to decrement all label values of segmentation maps by 1. Usually used for datasets where 0 + is used for background, and background itself is not included in all classes of a dataset (e.g. ADE20k). + The background label will be replaced by `ignore_index`. + num_labels (`int`, *optional*): + The number of labels in the segmentation map. + size_divisor (`int`, *optional*, defaults to `32`): + Some backbones need images divisible by a certain number. If not passed, it defaults to the value used in + Swin Transformer. + pad_size (`SizeDict`, *optional*): + The size to pad the images to. Must be larger than any image size provided for preprocessing. If `pad_size` + is not provided, images will be padded to the largest height and width in the batch. + """ + + ignore_index: int | None + do_reduce_labels: bool + num_labels: int | None + size_divisor: int + pad_size: SizeDict | None + + # Adapted from transformers.models.mask2former.image_processing_mask2former.binary_mask_to_rle def binary_mask_to_rle(mask): """ diff --git a/src/transformers/models/paddleocr_vl/image_processing_pil_paddleocr_vl.py b/src/transformers/models/paddleocr_vl/image_processing_pil_paddleocr_vl.py index 560b7869ddb9..c524acc0debc 100644 --- a/src/transformers/models/paddleocr_vl/image_processing_pil_paddleocr_vl.py +++ b/src/transformers/models/paddleocr_vl/image_processing_pil_paddleocr_vl.py @@ -31,9 +31,8 @@ from ...image_processing_backends import PilBackend from ...image_processing_utils import BatchFeature from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, ImageInput, PILImageResampling, SizeDict -from ...processing_utils import Unpack +from ...processing_utils import ImagesKwargs, Unpack from ...utils import TensorType, auto_docstring -from .image_processing_paddleocr_vl import PaddleOCRVLImageProcessorKwargs def smart_resize( @@ -68,6 +67,23 @@ def smart_resize( return h_bar, w_bar +class PaddleOCRVLImageProcessorKwargs(ImagesKwargs, total=False): + r""" + patch_size (`int`, *optional*, defaults to 14): + The spatial patch size of the vision encoder. + temporal_patch_size (`int`, *optional*, defaults to 1): + The temporal patch size of the vision encoder. 
+ merge_size (`int`, *optional*, defaults to 2): + The merge size of the vision encoder to llm encoder. + """ + + min_pixels: int + max_pixels: int + patch_size: int + temporal_patch_size: int + merge_size: int + + @auto_docstring class PaddleOCRVLImageProcessorPil(PilBackend): do_resize = True diff --git a/src/transformers/models/rt_detr/image_processing_pil_rt_detr.py b/src/transformers/models/rt_detr/image_processing_pil_rt_detr.py index 606b5640602c..669843e9f949 100644 --- a/src/transformers/models/rt_detr/image_processing_pil_rt_detr.py +++ b/src/transformers/models/rt_detr/image_processing_pil_rt_detr.py @@ -46,10 +46,9 @@ get_max_height_width, validate_annotations, ) -from ...processing_utils import Unpack +from ...processing_utils import ImagesKwargs, Unpack from ...utils import TensorType, auto_docstring, is_torch_available, requires_backends from ...utils.import_utils import requires -from .image_processing_rt_detr import RTDetrImageProcessorKwargs if is_torch_available(): @@ -58,6 +57,20 @@ SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC) +class RTDetrImageProcessorKwargs(ImagesKwargs, total=False): + r""" + format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): + Data format of the annotations. One of "coco_detection" or "coco_panoptic". + do_convert_annotations (`bool`, *optional*, defaults to `True`): + Controls whether to convert the annotations to the format expected by the RT_DETR model. Converts the + bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`. + Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method. + """ + + format: str | AnnotationFormat + do_convert_annotations: bool + + def prepare_coco_detection_annotation_pil( image, target, diff --git a/src/transformers/models/segformer/image_processing_pil_segformer.py b/src/transformers/models/segformer/image_processing_pil_segformer.py index 77514873c59a..7bffa8ab490f 100644 --- a/src/transformers/models/segformer/image_processing_pil_segformer.py +++ b/src/transformers/models/segformer/image_processing_pil_segformer.py @@ -31,10 +31,9 @@ PILImageResampling, SizeDict, ) -from ...processing_utils import Unpack +from ...processing_utils import ImagesKwargs, Unpack from ...utils import TensorType, auto_docstring, is_torch_available, is_torchvision_available from ...utils.import_utils import requires -from .image_processing_segformer import SegformerImageProcessorKwargs if is_torch_available(): @@ -210,4 +209,15 @@ def post_process_semantic_segmentation(self, outputs, target_sizes: list[tuple] return semantic_segmentation +class SegformerImageProcessorKwargs(ImagesKwargs, total=False): + r""" + do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`): + Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0 + is used for background, and background itself is not included in all classes of a dataset (e.g. + ADE20k). The background label will be replaced by 255. 
+ """ + + do_reduce_labels: bool + + __all__ = ["SegformerImageProcessorPil"] diff --git a/src/transformers/models/smolvlm/image_processing_pil_smolvlm.py b/src/transformers/models/smolvlm/image_processing_pil_smolvlm.py index 29f3a89f3418..dea8fad98b32 100644 --- a/src/transformers/models/smolvlm/image_processing_pil_smolvlm.py +++ b/src/transformers/models/smolvlm/image_processing_pil_smolvlm.py @@ -35,9 +35,8 @@ SizeDict, make_nested_list_of_images, ) -from ...processing_utils import Unpack +from ...processing_utils import ImagesKwargs, Unpack from ...utils import TensorType, auto_docstring -from .image_processing_smolvlm import SmolVLMImageProcessorKwargs def _make_pixel_mask(image: np.ndarray, output_size: tuple[int, int]) -> np.ndarray: @@ -48,6 +47,22 @@ def _make_pixel_mask(image: np.ndarray, output_size: tuple[int, int]) -> np.ndar return mask +class SmolVLMImageProcessorKwargs(ImagesKwargs, total=False): + """ + do_image_splitting (`bool`, *optional*, defaults to `True`): + Whether to split the image into sub-images concatenated with the original image. They are split into patches + such that each patch has a size of `max_image_size["height"]` x `max_image_size["width"]`. + max_image_size (`Dict`, *optional*, defaults to `{"longest_edge": 364}`): + Maximum resolution of the patches of images accepted by the model. This is a dictionary containing the key "longest_edge". + return_row_col_info (`bool`, *optional*, defaults to `False`): + Whether to return the row and column information of the images. + """ + + do_image_splitting: bool + max_image_size: dict[str, int] + return_row_col_info: bool + + # Adapted from transformers.models.smolvlm.image_processing_smolvlm.MAX_IMAGE_SIZE MAX_IMAGE_SIZE = 4096 # 4k resolution as absolute maximum diff --git a/src/transformers/models/video_llama_3/image_processing_pil_video_llama_3.py b/src/transformers/models/video_llama_3/image_processing_pil_video_llama_3.py index 46f1cbb7d25d..a48e79e09936 100644 --- a/src/transformers/models/video_llama_3/image_processing_pil_video_llama_3.py +++ b/src/transformers/models/video_llama_3/image_processing_pil_video_llama_3.py @@ -26,9 +26,8 @@ from ...feature_extraction_utils import BatchFeature from ...image_processing_backends import PilBackend from ...image_utils import IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD, ImageInput, PILImageResampling, SizeDict -from ...processing_utils import Unpack +from ...processing_utils import ImagesKwargs, Unpack from ...utils import TensorType, auto_docstring -from .image_processing_video_llama_3 import VideoLlama3ImageProcessorKwargs def smart_resize( @@ -60,6 +59,27 @@ def smart_resize( return h_bar, w_bar +class VideoLlama3ImageProcessorKwargs(ImagesKwargs, total=False): + r""" + min_pixels (`int`, *optional*, defaults to `56 * 56`): + The min pixels of the image to resize the image. + max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`): + The max pixels of the image to resize the image. + patch_size (`int`, *optional*, defaults to 14): + The spatial patch size of the vision encoder. + temporal_patch_size (`int`, *optional*, defaults to 2): + The temporal patch size of the vision encoder. + merge_size (`int`, *optional*, defaults to 2): + The merge size of the vision encoder to llm encoder. 
+ """ + + min_pixels: int + max_pixels: int + patch_size: int + temporal_patch_size: int + merge_size: int + + @auto_docstring class VideoLlama3ImageProcessorPil(PilBackend): do_resize = True diff --git a/src/transformers/models/yolos/image_processing_pil_yolos.py b/src/transformers/models/yolos/image_processing_pil_yolos.py index 7f5b8385d8b9..f42fb5a63701 100644 --- a/src/transformers/models/yolos/image_processing_pil_yolos.py +++ b/src/transformers/models/yolos/image_processing_pil_yolos.py @@ -33,10 +33,9 @@ get_max_height_width, validate_annotations, ) -from ...processing_utils import Unpack +from ...processing_utils import ImagesKwargs, Unpack from ...utils import TensorType, auto_docstring, is_torch_available, is_vision_available, requires_backends from ...utils.import_utils import requires -from .image_processing_yolos import YolosImageProcessorKwargs if is_vision_available(): @@ -48,6 +47,20 @@ SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC) +class YolosImageProcessorKwargs(ImagesKwargs, total=False): + r""" + format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): + Data format of the annotations. One of "coco_detection" or "coco_panoptic". + do_convert_annotations (`bool`, *optional*, defaults to `True`): + Controls whether to convert the annotations to the format expected by the YOLOS model. Converts the + bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`. + Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method. + """ + + format: str | AnnotationFormat + do_convert_annotations: bool + + # inspired by https://github.com/facebookresearch/yolos/blob/master/datasets/coco.py#L33 def convert_coco_poly_to_mask(segmentations, height: int, width: int) -> np.ndarray: """ diff --git a/utils/modular_model_converter.py b/utils/modular_model_converter.py index d5dc7dfe23b6..f1e887dedf44 100644 --- a/utils/modular_model_converter.py +++ b/utils/modular_model_converter.py @@ -1303,6 +1303,35 @@ def _code(node: cst.CSTNode) -> str: return other_imports + result +def replace_unprotected_image_processing_imports(files: dict, all_imports: list) -> dict: + """ + Because `image_processing` file uses non-protected torchvision and torch imports, we need to duplicate the nodes + inside `image_processing_pil` instead of importing them directly from `.image_processing_xxx`, which would crash if + torchvision is not installed. 
+ """ + if not ("image_processing" in files and "image_processing_pil" in files): + return files + + body = files["image_processing_pil"] + needed_imports = get_needed_imports(body, all_imports) + import_from_image_processing = None + for import_node in needed_imports: + if isinstance(import_node, cst.SimpleStatementLine) and isinstance(import_node.body[0], cst.ImportFrom): + import_node = import_node.body[0] + full_name = get_full_attribute_name(import_node.module) + if re.search(r"^image_processing_(?!(?:backends)|(?:utils))", full_name): + import_from_image_processing = import_node + break + + if import_from_image_processing is None: + return files + + imported_objects = [x.name.value for x in import_from_image_processing.names] + # Add the nodes inside the body of `image_processing_pil` + body.update({name: files["image_processing"][name] for name in imported_objects}) + return files + + def split_all_assignment(node: cst.CSTNode, model_name: str) -> dict[str, cst.CSTNode]: """Split the `__all__` assignment found in the modular between each corresponding files.""" all_all_per_file = {} @@ -1845,6 +1874,11 @@ def create_modules( all_imports.extend(new_imports) all_imports_code.update(new_imports_code) + # Because `image_processing` file uses non-protected torchvision and torch imports, we need to duplicate the nodes + # here instead of importing from `.image_processing_model`, which would crash if torchvision is not installed + if "image_processing" in files and "image_processing_pil" in files: + files = replace_unprotected_image_processing_imports(files, all_imports) + # Find the correct imports, and write the new modules for file, body in files.items(): new_body = [k[1]["node"] for k in sorted(body.items(), key=lambda x: x[1]["insert_idx"])] From bac493376b9edfa3fe93b2c4aea5c291625a0a5c Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Tue, 21 Apr 2026 18:27:55 +0900 Subject: [PATCH 7/9] fix order --- .../image_processing_pil_deepseek_vl.py | 20 +++---- ...image_processing_pil_deepseek_vl_hybrid.py | 52 +++++++++---------- .../image_processing_pil_llava_onevision.py | 22 ++++---- .../image_processing_pil_segformer.py | 22 ++++---- utils/modular_model_converter.py | 14 ++++- 5 files changed, 71 insertions(+), 59 deletions(-) diff --git a/src/transformers/models/deepseek_vl/image_processing_pil_deepseek_vl.py b/src/transformers/models/deepseek_vl/image_processing_pil_deepseek_vl.py index 6e2a220e3fd2..e868830b0220 100644 --- a/src/transformers/models/deepseek_vl/image_processing_pil_deepseek_vl.py +++ b/src/transformers/models/deepseek_vl/image_processing_pil_deepseek_vl.py @@ -36,6 +36,16 @@ from ...utils import TensorType, auto_docstring +class DeepseekVLImageProcessorKwargs(ImagesKwargs, total=False): + r""" + min_size (`int`, *optional*, defaults to 14): + The minimum allowed size for the resized image. Ensures that neither the height nor width + falls below this value after resizing. + """ + + min_size: int + + @auto_docstring class DeepseekVLImageProcessorPil(PilBackend): resample = PILImageResampling.BICUBIC @@ -161,14 +171,4 @@ def postprocess(self): raise AttributeError("Not needed for DeepseekVL") -class DeepseekVLImageProcessorKwargs(ImagesKwargs, total=False): - r""" - min_size (`int`, *optional*, defaults to 14): - The minimum allowed size for the resized image. Ensures that neither the height nor width - falls below this value after resizing. 
- """ - - min_size: int - - __all__ = ["DeepseekVLImageProcessorPil"] diff --git a/src/transformers/models/deepseek_vl_hybrid/image_processing_pil_deepseek_vl_hybrid.py b/src/transformers/models/deepseek_vl_hybrid/image_processing_pil_deepseek_vl_hybrid.py index b1ea56d48a46..55573c35c423 100644 --- a/src/transformers/models/deepseek_vl_hybrid/image_processing_pil_deepseek_vl_hybrid.py +++ b/src/transformers/models/deepseek_vl_hybrid/image_processing_pil_deepseek_vl_hybrid.py @@ -38,6 +38,32 @@ from ...utils import TensorType, auto_docstring +class DeepseekVLHybridImageProcessorKwargs(ImagesKwargs, total=False): + r""" + min_size (`int`, *optional*, defaults to 14): + The minimum allowed size for the resized image. Ensures that neither the height nor width + falls below this value after resizing. + high_res_size (`dict`, *optional*, defaults to `{"height": 1024, "width": 1024}`): + Size of the high resolution output image after resizing. Can be overridden by the `high_res_size` parameter in the `preprocess` + method. + high_res_resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`): + Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be + overridden by the `high_res_resample` parameter in the `preprocess` method. + high_res_image_mean (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_MEAN`): + Mean to use if normalizing the high resolution image. This is a float or list of floats the length of the number of + channels in the image. Can be overridden by the `high_res_image_mean` parameter in the `preprocess` method. + high_res_image_std (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_STD`): + Standard deviation to use if normalizing the high resolution image. This is a float or list of floats the length of the + number of channels in the image. Can be overridden by the `high_res_image_std` parameter in the `preprocess` method. + """ + + min_size: int + high_res_size: dict + high_res_resample: Union["PILImageResampling", int] + high_res_image_mean: float | list[float] | tuple[float, ...] + high_res_image_std: float | list[float] | tuple[float, ...] + + @auto_docstring class DeepseekVLHybridImageProcessorPil(PilBackend): resample = PILImageResampling.BICUBIC @@ -232,30 +258,4 @@ def _standardize_kwargs( return kwargs -class DeepseekVLHybridImageProcessorKwargs(ImagesKwargs, total=False): - r""" - min_size (`int`, *optional*, defaults to 14): - The minimum allowed size for the resized image. Ensures that neither the height nor width - falls below this value after resizing. - high_res_size (`dict`, *optional*, defaults to `{"height": 1024, "width": 1024}`): - Size of the high resolution output image after resizing. Can be overridden by the `high_res_size` parameter in the `preprocess` - method. - high_res_resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`): - Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be - overridden by the `high_res_resample` parameter in the `preprocess` method. - high_res_image_mean (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_MEAN`): - Mean to use if normalizing the high resolution image. This is a float or list of floats the length of the number of - channels in the image. Can be overridden by the `high_res_image_mean` parameter in the `preprocess` method. 
- high_res_image_std (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_STD`): - Standard deviation to use if normalizing the high resolution image. This is a float or list of floats the length of the - number of channels in the image. Can be overridden by the `high_res_image_std` parameter in the `preprocess` method. - """ - - min_size: int - high_res_size: dict - high_res_resample: Union["PILImageResampling", int] - high_res_image_mean: float | list[float] | tuple[float, ...] - high_res_image_std: float | list[float] | tuple[float, ...] - - __all__ = ["DeepseekVLHybridImageProcessorPil"] diff --git a/src/transformers/models/llava_onevision/image_processing_pil_llava_onevision.py b/src/transformers/models/llava_onevision/image_processing_pil_llava_onevision.py index 96a973ead67d..23534a65d70f 100644 --- a/src/transformers/models/llava_onevision/image_processing_pil_llava_onevision.py +++ b/src/transformers/models/llava_onevision/image_processing_pil_llava_onevision.py @@ -36,6 +36,17 @@ from ...utils import TensorType, auto_docstring +class LlavaOnevisionImageProcessorKwargs(ImagesKwargs, total=False): + r""" + image_grid_pinpoints (`list[list[int]]`, *optional*): + A list of possible resolutions to use for processing high resolution images. The best resolution is selected + based on the original size of the image. Can be overridden by `image_grid_pinpoints` in the `preprocess` + method. + """ + + image_grid_pinpoints: list[list[int]] + + @auto_docstring class LlavaOnevisionImageProcessorPil(PilBackend): model_input_names = ["pixel_values", "image_sizes", "batch_num_images"] @@ -293,15 +304,4 @@ def pad_to_square( return result -class LlavaOnevisionImageProcessorKwargs(ImagesKwargs, total=False): - r""" - image_grid_pinpoints (`list[list[int]]`, *optional*): - A list of possible resolutions to use for processing high resolution images. The best resolution is selected - based on the original size of the image. Can be overridden by `image_grid_pinpoints` in the `preprocess` - method. - """ - - image_grid_pinpoints: list[list[int]] - - __all__ = ["LlavaOnevisionImageProcessorPil"] diff --git a/src/transformers/models/segformer/image_processing_pil_segformer.py b/src/transformers/models/segformer/image_processing_pil_segformer.py index 7bffa8ab490f..f1d0bb0f627b 100644 --- a/src/transformers/models/segformer/image_processing_pil_segformer.py +++ b/src/transformers/models/segformer/image_processing_pil_segformer.py @@ -42,6 +42,17 @@ import torchvision.transforms.v2.functional as tvF +class SegformerImageProcessorKwargs(ImagesKwargs, total=False): + r""" + do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`): + Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0 + is used for background, and background itself is not included in all classes of a dataset (e.g. + ADE20k). The background label will be replaced by 255. + """ + + do_reduce_labels: bool + + @requires(backends=("torch", "torchvision")) class SegformerImageProcessorPil(PilBackend): """PIL backend for Segformer with reduce_label support.""" @@ -209,15 +220,4 @@ def post_process_semantic_segmentation(self, outputs, target_sizes: list[tuple] return semantic_segmentation -class SegformerImageProcessorKwargs(ImagesKwargs, total=False): - r""" - do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`): - Whether or not to reduce all label values of segmentation maps by 1. 
Usually used for datasets where 0 - is used for background, and background itself is not included in all classes of a dataset (e.g. - ADE20k). The background label will be replaced by 255. - """ - - do_reduce_labels: bool - - __all__ = ["SegformerImageProcessorPil"] diff --git a/utils/modular_model_converter.py b/utils/modular_model_converter.py index f1e887dedf44..4ef6fe374e7a 100644 --- a/utils/modular_model_converter.py +++ b/utils/modular_model_converter.py @@ -1327,8 +1327,20 @@ def replace_unprotected_image_processing_imports(files: dict, all_imports: list) return files imported_objects = [x.name.value for x in import_from_image_processing.names] + nodes_to_add = {name: files["image_processing"][name] for name in imported_objects} + # Update the position inside the final file + for name, node_structure in nodes_to_add.items(): + node_with_same_index = next( + v["node"] for v in body.values() if v["insert_idx"] == node_structure["insert_idx"] + ) + # Insert the new node before the corresponding node if the corresponding node is a class + if isinstance(node_with_same_index, cst.ClassDef): + nodes_to_add[name]["insert_idx"] -= 0.5 + # Otherwise, after it + else: + nodes_to_add[name]["insert_idx"] += 0.5 # Add the nodes inside the body of `image_processing_pil` - body.update({name: files["image_processing"][name] for name in imported_objects}) + body.update(nodes_to_add) return files From 58ed72b7da9e2a06385c4480f9457a64f45e2440 Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Tue, 21 Apr 2026 18:34:38 +0900 Subject: [PATCH 8/9] fix order --- .../image_processing_pil_efficientloftr.py | 12 ++--- .../image_processing_pil_ernie4_5_vl_moe.py | 30 +++++------ .../glm46v/image_processing_pil_glm46v.py | 30 +++++------ .../image_processing_pil_glm_image.py | 42 +++++++-------- .../image_processing_pil_lightglue.py | 12 ++--- .../image_processing_pil_mask2former.py | 52 +++++++++---------- .../image_processing_pil_paddleocr_vl.py | 34 ++++++------ .../smolvlm/image_processing_pil_smolvlm.py | 16 +++--- .../image_processing_pil_video_llama_3.py | 42 +++++++-------- utils/modular_model_converter.py | 4 +- 10 files changed, 137 insertions(+), 137 deletions(-) diff --git a/src/transformers/models/efficientloftr/image_processing_pil_efficientloftr.py b/src/transformers/models/efficientloftr/image_processing_pil_efficientloftr.py index 7c42d75f2baa..5f467c56dd4f 100644 --- a/src/transformers/models/efficientloftr/image_processing_pil_efficientloftr.py +++ b/src/transformers/models/efficientloftr/image_processing_pil_efficientloftr.py @@ -32,12 +32,6 @@ import torch -def is_grayscale(image: np.ndarray): - if image.shape[0] == 1: - return True - return np.all(image[0, ...] == image[1, ...]) and np.all(image[1, ...] == image[2, ...]) - - class EfficientLoFTRImageProcessorKwargs(ImagesKwargs, total=False): r""" do_grayscale (`bool`, *optional*, defaults to `self.do_grayscale`): @@ -47,6 +41,12 @@ class EfficientLoFTRImageProcessorKwargs(ImagesKwargs, total=False): do_grayscale: bool +def is_grayscale(image: np.ndarray): + if image.shape[0] == 1: + return True + return np.all(image[0, ...] == image[1, ...]) and np.all(image[1, ...] == image[2, ...]) + + def convert_to_grayscale(image: ImageInput) -> ImageInput: """ Converts an image to grayscale format using the NTSC formula. Only support numpy and PIL Image. 
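The reordering applied to this file (and the ones below) falls out of the fractional `insert_idx` scheme introduced in PATCH 7 above and extended to functions by the converter tweak at the end of this patch: when a duplicated node collides with an existing `insert_idx`, it is nudged to `idx - 0.5` if the colliding node is a class or function, or to `idx + 0.5` otherwise, and the file body is then written out sorted by `insert_idx`. The toy sketch below only illustrates that ordering step, with stand-in strings in place of the converter's real CST nodes.

# toy model of the ordering step: a file body maps names to {"insert_idx": number, "node": source},
# and the final file is written out sorted by insert_idx
body = {
    "is_grayscale": {"insert_idx": 1, "node": "def is_grayscale(image): ..."},
    "convert_to_grayscale": {"insert_idx": 2, "node": "def convert_to_grayscale(image): ..."},
}

# a duplicated kwargs class landing on the same index as a function (or class) is pushed just
# before it by subtracting 0.5, so sorting keeps it ahead of its first user
body["EfficientLoFTRImageProcessorKwargs"] = {
    "insert_idx": 1 - 0.5,
    "node": "class EfficientLoFTRImageProcessorKwargs(ImagesKwargs, total=False): ...",
}

new_body = [entry["node"] for _, entry in sorted(body.items(), key=lambda item: item[1]["insert_idx"])]
print(new_body)  # kwargs class first, then is_grayscale, then convert_to_grayscale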
diff --git a/src/transformers/models/ernie4_5_vl_moe/image_processing_pil_ernie4_5_vl_moe.py b/src/transformers/models/ernie4_5_vl_moe/image_processing_pil_ernie4_5_vl_moe.py index 4b6db850f8da..7f372c3af02d 100644 --- a/src/transformers/models/ernie4_5_vl_moe/image_processing_pil_ernie4_5_vl_moe.py +++ b/src/transformers/models/ernie4_5_vl_moe/image_processing_pil_ernie4_5_vl_moe.py @@ -32,6 +32,21 @@ logger = logging.get_logger(__name__) +class Ernie4_5_VLMoeImageProcessorKwargs(ImagesKwargs, total=False): + r""" + patch_size (`int`, *optional*, defaults to 14): + The spatial patch size of the vision encoder. + temporal_patch_size (`int`, *optional*): + The temporal patch size of the vision encoder. Unused in the image processor, only used for videos. + merge_size (`int`, *optional*, defaults to 2): + The merge size of the vision encoder to llm encoder. + """ + + patch_size: int + temporal_patch_size: int + merge_size: int + + def smart_resize( height: int, width: int, factor: int = 28, min_pixels: int = 56 * 56, max_pixels: int = 14 * 14 * 4 * 1280 ): @@ -61,21 +76,6 @@ def smart_resize( return h_bar, w_bar -class Ernie4_5_VLMoeImageProcessorKwargs(ImagesKwargs, total=False): - r""" - patch_size (`int`, *optional*, defaults to 14): - The spatial patch size of the vision encoder. - temporal_patch_size (`int`, *optional*): - The temporal patch size of the vision encoder. Unused in the image processor, only used for videos. - merge_size (`int`, *optional*, defaults to 2): - The merge size of the vision encoder to llm encoder. - """ - - patch_size: int - temporal_patch_size: int - merge_size: int - - @auto_docstring class Ernie4_5_VLMoeImageProcessorPil(PilBackend): do_resize = True diff --git a/src/transformers/models/glm46v/image_processing_pil_glm46v.py b/src/transformers/models/glm46v/image_processing_pil_glm46v.py index 5070535f6ecf..5601e732c2b3 100644 --- a/src/transformers/models/glm46v/image_processing_pil_glm46v.py +++ b/src/transformers/models/glm46v/image_processing_pil_glm46v.py @@ -30,6 +30,21 @@ from ...utils import TensorType, auto_docstring +class Glm46VImageProcessorKwargs(ImagesKwargs, total=False): + """ + patch_size (`int`, *optional*, defaults to 14): + The spatial patch size of the vision encoder. + temporal_patch_size (`int`, *optional*, defaults to 2): + The temporal patch size of the vision encoder. + merge_size (`int`, *optional*, defaults to 2): + The merge size of the vision encoder to llm encoder. + """ + + patch_size: int + temporal_patch_size: int + merge_size: int + + # Adapted from transformers.models.glm46v.image_processing_glm46v.smart_resize def smart_resize( num_frames: int, @@ -67,21 +82,6 @@ def smart_resize( return h_bar, w_bar -class Glm46VImageProcessorKwargs(ImagesKwargs, total=False): - """ - patch_size (`int`, *optional*, defaults to 14): - The spatial patch size of the vision encoder. - temporal_patch_size (`int`, *optional*, defaults to 2): - The temporal patch size of the vision encoder. - merge_size (`int`, *optional*, defaults to 2): - The merge size of the vision encoder to llm encoder. 
- """ - - patch_size: int - temporal_patch_size: int - merge_size: int - - @auto_docstring class Glm46VImageProcessorPil(PilBackend): do_resize = True diff --git a/src/transformers/models/glm_image/image_processing_pil_glm_image.py b/src/transformers/models/glm_image/image_processing_pil_glm_image.py index 7861cc32a1ae..355bb04adb67 100644 --- a/src/transformers/models/glm_image/image_processing_pil_glm_image.py +++ b/src/transformers/models/glm_image/image_processing_pil_glm_image.py @@ -30,6 +30,27 @@ from ...utils import TensorType, auto_docstring +class GlmImageImageProcessorKwargs(ImagesKwargs, total=False): + r""" + min_pixels (`int`, *optional*, defaults to `56 * 56`): + The min pixels of the image to resize the image. + max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`): + The max pixels of the image to resize the image. + patch_size (`int`, *optional*, defaults to 14): + The spatial patch size of the vision encoder. + temporal_patch_size (`int`, *optional*, defaults to 2): + The temporal patch size of the vision encoder. + merge_size (`int`, *optional*, defaults to 2): + The merge size of the vision encoder to llm encoder. + """ + + min_pixels: int + max_pixels: int + patch_size: int + temporal_patch_size: int + merge_size: int + + def smart_resize( height: int, width: int, @@ -71,27 +92,6 @@ def smart_resize( return h_bar, w_bar -class GlmImageImageProcessorKwargs(ImagesKwargs, total=False): - r""" - min_pixels (`int`, *optional*, defaults to `56 * 56`): - The min pixels of the image to resize the image. - max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`): - The max pixels of the image to resize the image. - patch_size (`int`, *optional*, defaults to 14): - The spatial patch size of the vision encoder. - temporal_patch_size (`int`, *optional*, defaults to 2): - The temporal patch size of the vision encoder. - merge_size (`int`, *optional*, defaults to 2): - The merge size of the vision encoder to llm encoder. - """ - - min_pixels: int - max_pixels: int - patch_size: int - temporal_patch_size: int - merge_size: int - - @auto_docstring class GlmImageImageProcessorPil(PilBackend): do_resize = True diff --git a/src/transformers/models/lightglue/image_processing_pil_lightglue.py b/src/transformers/models/lightglue/image_processing_pil_lightglue.py index 77389f8e8da3..9f43fe1bbc7a 100644 --- a/src/transformers/models/lightglue/image_processing_pil_lightglue.py +++ b/src/transformers/models/lightglue/image_processing_pil_lightglue.py @@ -46,12 +46,6 @@ import torch -def is_grayscale(image: np.ndarray): - if image.shape[0] == 1: - return True - return np.all(image[0, ...] == image[1, ...]) and np.all(image[1, ...] == image[2, ...]) - - class LightGlueImageProcessorKwargs(ImagesKwargs, total=False): r""" do_grayscale (`bool`, *optional*, defaults to `self.do_grayscale`): @@ -61,6 +55,12 @@ class LightGlueImageProcessorKwargs(ImagesKwargs, total=False): do_grayscale: bool +def is_grayscale(image: np.ndarray): + if image.shape[0] == 1: + return True + return np.all(image[0, ...] == image[1, ...]) and np.all(image[1, ...] == image[2, ...]) + + def convert_to_grayscale(image: ImageInput) -> ImageInput: """ Converts an image to grayscale format using the NTSC formula. Only support numpy and PIL Image. 
diff --git a/src/transformers/models/mask2former/image_processing_pil_mask2former.py b/src/transformers/models/mask2former/image_processing_pil_mask2former.py index 6b27657b3677..8358a3601bed 100644 --- a/src/transformers/models/mask2former/image_processing_pil_mask2former.py +++ b/src/transformers/models/mask2former/image_processing_pil_mask2former.py @@ -51,6 +51,32 @@ logger = logging.get_logger(__name__) +class Mask2FormerImageProcessorKwargs(ImagesKwargs, total=False): + r""" + ignore_index (`int`, *optional*): + Label to be assigned to background pixels in segmentation maps. If provided, segmentation map pixels + denoted with 0 (background) will be replaced with `ignore_index`. + do_reduce_labels (`bool`, *optional*, defaults to `False`): + Whether or not to decrement all label values of segmentation maps by 1. Usually used for datasets where 0 + is used for background, and background itself is not included in all classes of a dataset (e.g. ADE20k). + The background label will be replaced by `ignore_index`. + num_labels (`int`, *optional*): + The number of labels in the segmentation map. + size_divisor (`int`, *optional*, defaults to `32`): + Some backbones need images divisible by a certain number. If not passed, it defaults to the value used in + Swin Transformer. + pad_size (`SizeDict`, *optional*): + The size to pad the images to. Must be larger than any image size provided for preprocessing. If `pad_size` + is not provided, images will be padded to the largest height and width in the batch. + """ + + ignore_index: int | None + do_reduce_labels: bool + num_labels: int | None + size_divisor: int + pad_size: SizeDict | None + + def convert_segmentation_map_to_binary_masks( segmentation_map: np.ndarray, instance_id_to_semantic_id: dict[int, int] | None = None, @@ -87,32 +113,6 @@ def convert_segmentation_map_to_binary_masks( return binary_masks.astype(np.float32), labels -class Mask2FormerImageProcessorKwargs(ImagesKwargs, total=False): - r""" - ignore_index (`int`, *optional*): - Label to be assigned to background pixels in segmentation maps. If provided, segmentation map pixels - denoted with 0 (background) will be replaced with `ignore_index`. - do_reduce_labels (`bool`, *optional*, defaults to `False`): - Whether or not to decrement all label values of segmentation maps by 1. Usually used for datasets where 0 - is used for background, and background itself is not included in all classes of a dataset (e.g. ADE20k). - The background label will be replaced by `ignore_index`. - num_labels (`int`, *optional*): - The number of labels in the segmentation map. - size_divisor (`int`, *optional*, defaults to `32`): - Some backbones need images divisible by a certain number. If not passed, it defaults to the value used in - Swin Transformer. - pad_size (`SizeDict`, *optional*): - The size to pad the images to. Must be larger than any image size provided for preprocessing. If `pad_size` - is not provided, images will be padded to the largest height and width in the batch. 
- """ - - ignore_index: int | None - do_reduce_labels: bool - num_labels: int | None - size_divisor: int - pad_size: SizeDict | None - - # Adapted from transformers.models.mask2former.image_processing_mask2former.binary_mask_to_rle def binary_mask_to_rle(mask): """ diff --git a/src/transformers/models/paddleocr_vl/image_processing_pil_paddleocr_vl.py b/src/transformers/models/paddleocr_vl/image_processing_pil_paddleocr_vl.py index c524acc0debc..ac639892640f 100644 --- a/src/transformers/models/paddleocr_vl/image_processing_pil_paddleocr_vl.py +++ b/src/transformers/models/paddleocr_vl/image_processing_pil_paddleocr_vl.py @@ -35,6 +35,23 @@ from ...utils import TensorType, auto_docstring +class PaddleOCRVLImageProcessorKwargs(ImagesKwargs, total=False): + r""" + patch_size (`int`, *optional*, defaults to 14): + The spatial patch size of the vision encoder. + temporal_patch_size (`int`, *optional*, defaults to 1): + The temporal patch size of the vision encoder. + merge_size (`int`, *optional*, defaults to 2): + The merge size of the vision encoder to llm encoder. + """ + + min_pixels: int + max_pixels: int + patch_size: int + temporal_patch_size: int + merge_size: int + + def smart_resize( height: int, width: int, @@ -67,23 +84,6 @@ def smart_resize( return h_bar, w_bar -class PaddleOCRVLImageProcessorKwargs(ImagesKwargs, total=False): - r""" - patch_size (`int`, *optional*, defaults to 14): - The spatial patch size of the vision encoder. - temporal_patch_size (`int`, *optional*, defaults to 1): - The temporal patch size of the vision encoder. - merge_size (`int`, *optional*, defaults to 2): - The merge size of the vision encoder to llm encoder. - """ - - min_pixels: int - max_pixels: int - patch_size: int - temporal_patch_size: int - merge_size: int - - @auto_docstring class PaddleOCRVLImageProcessorPil(PilBackend): do_resize = True diff --git a/src/transformers/models/smolvlm/image_processing_pil_smolvlm.py b/src/transformers/models/smolvlm/image_processing_pil_smolvlm.py index dea8fad98b32..3d53ed09c11f 100644 --- a/src/transformers/models/smolvlm/image_processing_pil_smolvlm.py +++ b/src/transformers/models/smolvlm/image_processing_pil_smolvlm.py @@ -39,14 +39,6 @@ from ...utils import TensorType, auto_docstring -def _make_pixel_mask(image: np.ndarray, output_size: tuple[int, int]) -> np.ndarray: - """Make pixel mask: 1=valid, 0=padding. Images are CHW.""" - h, w = image.shape[-2:] - mask = np.zeros(output_size, dtype=np.int64) - mask[:h, :w] = 1 - return mask - - class SmolVLMImageProcessorKwargs(ImagesKwargs, total=False): """ do_image_splitting (`bool`, *optional*, defaults to `True`): @@ -63,6 +55,14 @@ class SmolVLMImageProcessorKwargs(ImagesKwargs, total=False): return_row_col_info: bool +def _make_pixel_mask(image: np.ndarray, output_size: tuple[int, int]) -> np.ndarray: + """Make pixel mask: 1=valid, 0=padding. 
Images are CHW.""" + h, w = image.shape[-2:] + mask = np.zeros(output_size, dtype=np.int64) + mask[:h, :w] = 1 + return mask + + # Adapted from transformers.models.smolvlm.image_processing_smolvlm.MAX_IMAGE_SIZE MAX_IMAGE_SIZE = 4096 # 4k resolution as absolute maximum diff --git a/src/transformers/models/video_llama_3/image_processing_pil_video_llama_3.py b/src/transformers/models/video_llama_3/image_processing_pil_video_llama_3.py index a48e79e09936..5272c7465b2b 100644 --- a/src/transformers/models/video_llama_3/image_processing_pil_video_llama_3.py +++ b/src/transformers/models/video_llama_3/image_processing_pil_video_llama_3.py @@ -30,6 +30,27 @@ from ...utils import TensorType, auto_docstring +class VideoLlama3ImageProcessorKwargs(ImagesKwargs, total=False): + r""" + min_pixels (`int`, *optional*, defaults to `56 * 56`): + The min pixels of the image to resize the image. + max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`): + The max pixels of the image to resize the image. + patch_size (`int`, *optional*, defaults to 14): + The spatial patch size of the vision encoder. + temporal_patch_size (`int`, *optional*, defaults to 2): + The temporal patch size of the vision encoder. + merge_size (`int`, *optional*, defaults to 2): + The merge size of the vision encoder to llm encoder. + """ + + min_pixels: int + max_pixels: int + patch_size: int + temporal_patch_size: int + merge_size: int + + def smart_resize( height: int, width: int, factor: int = 28, min_pixels: int = 56 * 56, max_pixels: int = 14 * 14 * 4 * 1280 ): @@ -59,27 +80,6 @@ def smart_resize( return h_bar, w_bar -class VideoLlama3ImageProcessorKwargs(ImagesKwargs, total=False): - r""" - min_pixels (`int`, *optional*, defaults to `56 * 56`): - The min pixels of the image to resize the image. - max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`): - The max pixels of the image to resize the image. - patch_size (`int`, *optional*, defaults to 14): - The spatial patch size of the vision encoder. - temporal_patch_size (`int`, *optional*, defaults to 2): - The temporal patch size of the vision encoder. - merge_size (`int`, *optional*, defaults to 2): - The merge size of the vision encoder to llm encoder. 
- """ - - min_pixels: int - max_pixels: int - patch_size: int - temporal_patch_size: int - merge_size: int - - @auto_docstring class VideoLlama3ImageProcessorPil(PilBackend): do_resize = True diff --git a/utils/modular_model_converter.py b/utils/modular_model_converter.py index 4ef6fe374e7a..018316680ece 100644 --- a/utils/modular_model_converter.py +++ b/utils/modular_model_converter.py @@ -1333,8 +1333,8 @@ def replace_unprotected_image_processing_imports(files: dict, all_imports: list) node_with_same_index = next( v["node"] for v in body.values() if v["insert_idx"] == node_structure["insert_idx"] ) - # Insert the new node before the corresponding node if the corresponding node is a class - if isinstance(node_with_same_index, cst.ClassDef): + # Insert the new node before the corresponding node if the corresponding node is a class or function + if isinstance(node_with_same_index, (cst.ClassDef, cst.FunctionDef)): nodes_to_add[name]["insert_idx"] -= 0.5 # Otherwise, after it else: From 05e1b5b243cd40348661bfcc3d47e02160e9eca5 Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Tue, 21 Apr 2026 18:36:55 +0900 Subject: [PATCH 9/9] revert --- src/transformers/dependency_versions_table.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index 0456904dd3d5..b08aa558d795 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -56,7 +56,6 @@ "rjieba": "rjieba", "rouge-score": "rouge-score!=0.0.7,!=0.0.8,!=0.1,!=0.1.1", "ruff": "ruff==0.14.10", - "transformers-mlinter": "transformers-mlinter @ git+https://github.com/huggingface/transformers-mlinter@b9d319ce264c106f97a959d926ef42bc3c0ea4d1", "ty": "ty==0.0.20", "sacrebleu": "sacrebleu>=1.4.12,<2.0.0", "sacremoses": "sacremoses",