@@ -61,6 +61,8 @@

logger = logging.get_logger(__name__)

SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)


class ConditionalDetrImageProcessorKwargs(ImagesKwargs, total=False):
r"""
@@ -76,9 +78,6 @@ class ConditionalDetrImageProcessorKwargs(ImagesKwargs, total=False):
do_convert_annotations: bool


SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)


# inspired by https://github.com/facebookresearch/conditional_detr/blob/master/datasets/coco.py#L33
def convert_coco_poly_to_mask(segmentations, height: int, width: int) -> np.ndarray:
"""
@@ -20,13 +20,12 @@
from ...image_transforms import (
center_to_corners_format,
)
from ...image_utils import AnnotationFormat
from ...masking_utils import create_bidirectional_mask
from ...modeling_outputs import (
BaseModelOutput,
)
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
from ...processing_utils import ImagesKwargs, Unpack
from ...processing_utils import Unpack
from ...utils import (
TensorType,
TransformersKwargs,
@@ -66,20 +65,6 @@
logger = logging.get_logger(__name__)


class ConditionalDetrImageProcessorKwargs(ImagesKwargs, total=False):
r"""
format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`):
Data format of the annotations. One of "coco_detection" or "coco_panoptic".
do_convert_annotations (`bool`, *optional*, defaults to `True`):
Controls whether to convert the annotations to the format expected by the CONDITIONAL_DETR model. Converts the
bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
"""

format: str | AnnotationFormat
do_convert_annotations: bool


class ConditionalDetrImageProcessor(DetrImageProcessor):
def post_process_object_detection(
self, outputs, threshold: float = 0.5, target_sizes: TensorType | list[tuple] = None, top_k: int = 100
12 changes: 1 addition & 11 deletions src/transformers/models/deepseek_vl/modular_deepseek_vl.py
@@ -20,7 +20,7 @@
from ...configuration_utils import PreTrainedConfig
from ...image_processing_utils import BatchFeature
from ...image_utils import ImageInput
from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack
from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
from ...tokenization_utils_base import (
PreTokenizedInput,
TextInput,
@@ -152,16 +152,6 @@ def generate(self):
raise AttributeError("Not needed for DeepseekVL")


class DeepseekVLImageProcessorKwargs(ImagesKwargs, total=False):
r"""
min_size (`int`, *optional*, defaults to 14):
The minimum allowed size for the resized image. Ensures that neither the height nor width
falls below this value after resizing.
"""

min_size: int


class DeepseekVLImageProcessorPil(JanusImageProcessorPil):
def postprocess(self):
raise AttributeError("Not needed for DeepseekVL")
@@ -57,6 +57,8 @@
if is_torch_available():
import torch

SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)


class DeformableDetrImageProcessorKwargs(ImagesKwargs, total=False):
r"""
@@ -72,9 +74,6 @@ class DeformableDetrImageProcessorKwargs(ImagesKwargs, total=False):
do_convert_annotations: bool


SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)


# inspired by https://github.com/facebookresearch/deformable_detr/blob/master/datasets/coco.py#L33
def convert_coco_poly_to_mask(segmentations, height: int, width: int) -> np.ndarray:
"""
@@ -23,11 +23,10 @@
from ... import initialization as init
from ...backbone_utils import load_backbone
from ...image_transforms import center_to_corners_format
from ...image_utils import AnnotationFormat
from ...integrations import use_kernel_forward_from_hub
from ...modeling_outputs import BaseModelOutput
from ...modeling_utils import PreTrainedModel
from ...processing_utils import ImagesKwargs, Unpack
from ...processing_utils import Unpack
from ...utils import (
ModelOutput,
TensorType,
@@ -61,20 +60,6 @@
logger = logging.get_logger(__name__)


class DeformableDetrImageProcessorKwargs(ImagesKwargs, total=False):
r"""
format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`):
Data format of the annotations. One of "coco_detection" or "coco_panoptic".
do_convert_annotations (`bool`, *optional*, defaults to `True`):
Controls whether to convert the annotations to the format expected by the DEFORMABLE_DETR model. Converts the
bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
"""

format: str | AnnotationFormat
do_convert_annotations: bool


class DeformableDetrImageProcessor(DetrImageProcessor):
def post_process_object_detection(
self, outputs, threshold: float = 0.5, target_sizes: TensorType | list[tuple] = None, top_k: int = 100
10 changes: 0 additions & 10 deletions src/transformers/models/efficientloftr/modular_efficientloftr.py
@@ -1,6 +1,5 @@
from typing import TYPE_CHECKING

from ...processing_utils import ImagesKwargs
from ...utils import TensorType, is_torch_available
from ...utils.import_utils import requires
from ..superglue.image_processing_pil_superglue import SuperGlueImageProcessorPil
@@ -14,15 +13,6 @@
from .modeling_efficientloftr import EfficientLoFTRKeypointMatchingOutput


class EfficientLoFTRImageProcessorKwargs(ImagesKwargs, total=False):
r"""
do_grayscale (`bool`, *optional*, defaults to `self.do_grayscale`):
Whether to convert the image to grayscale. Can be overridden by `do_grayscale` in the `preprocess` method.
"""

do_grayscale: bool


class EfficientLoFTRImageProcessor(SuperGlueImageProcessor):
def post_process_keypoint_matching(
self,
@@ -43,7 +43,7 @@
from ...modeling_outputs import BaseModelOutputWithPooling, MoeCausalLMOutputWithPast, MoeModelOutputWithPast
from ...modeling_rope_utils import dynamic_rope_update
from ...modeling_utils import PreTrainedModel
from ...processing_utils import ImagesKwargs, Unpack
from ...processing_utils import Unpack
from ...utils import (
TensorType,
TransformersKwargs,
@@ -63,7 +63,7 @@
Ernie4_5_MoeStatics,
Ernie4_5_MoeTopKRouter,
)
from ..glm4v.image_processing_glm4v import Glm4vImageProcessor
from ..glm4v.image_processing_glm4v import Glm4vImageProcessor, Glm4vImageProcessorKwargs
from ..glm4v.image_processing_pil_glm4v import Glm4vImageProcessorPil
from ..glm4v.modeling_glm4v import Glm4vForConditionalGeneration
from ..mixtral.modeling_mixtral import load_balancing_loss_func
@@ -1220,7 +1220,7 @@ def forward(
)


class Ernie4_5_VLMoeImageProcessorKwargs(ImagesKwargs, total=False):
class Ernie4_5_VLMoeImageProcessorKwargs(Glm4vImageProcessorKwargs):
r"""
patch_size (`int`, *optional*, defaults to 14):
The spatial patch size of the vision encoder.
@@ -1230,10 +1230,6 @@ class Ernie4_5_VLMoeImageProcessorKwargs(ImagesKwargs, total=False):
The merge size of the vision encoder to llm encoder.
"""

patch_size: int
temporal_patch_size: int
merge_size: int


class Ernie4_5_VLMoeImageProcessorPil(Glm4vImageProcessorPil):
size = {"shortest_edge": 56 * 56, "longest_edge": 28 * 28 * 6177}
@@ -30,7 +30,6 @@
from ...utils import TensorType, auto_docstring


# Adapted from transformers.models.glm_image.image_processing_glm_image.GlmImageImageProcessorKwargs
class GlmImageImageProcessorKwargs(ImagesKwargs, total=False):
r"""
min_pixels (`int`, *optional*, defaults to `56 * 56`):
@@ -67,6 +67,8 @@
if is_torch_available():
import torch

SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)


class GroundingDinoImageProcessorKwargs(ImagesKwargs, total=False):
r"""
@@ -82,9 +84,6 @@ class GroundingDinoImageProcessorKwargs(ImagesKwargs, total=False):
do_convert_annotations: bool


SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)


# inspired by https://github.com/facebookresearch/grounding_dino/blob/master/datasets/coco.py#L33
def convert_coco_poly_to_mask(segmentations, height: int, width: int) -> np.ndarray:
"""
16 changes: 0 additions & 16 deletions src/transformers/models/grounding_dino/modular_grounding_dino.py
@@ -25,8 +25,6 @@
from transformers.models.detr.image_processing_pil_detr import DetrImageProcessorPil

from ...image_transforms import center_to_corners_format
from ...image_utils import AnnotationFormat
from ...processing_utils import ImagesKwargs
from ...utils import (
TensorType,
logging,
@@ -70,20 +68,6 @@ def _scale_boxes(boxes, target_sizes):
return boxes


class GroundingDinoImageProcessorKwargs(ImagesKwargs, total=False):
r"""
format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`):
Data format of the annotations. One of "coco_detection" or "coco_panoptic".
do_convert_annotations (`bool`, *optional*, defaults to `True`):
Controls whether to convert the annotations to the format expected by the GROUNDING_DINO model. Converts the
bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
"""

format: str | AnnotationFormat
do_convert_annotations: bool


class GroundingDinoImageProcessor(DetrImageProcessor):
def post_process_object_detection(
self,
13 changes: 4 additions & 9 deletions src/transformers/models/lightglue/modular_lightglue.py
@@ -23,7 +23,7 @@
from ...configuration_utils import PreTrainedConfig
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import ImagesKwargs, Unpack
from ...processing_utils import Unpack
from ...utils import ModelOutput, TensorType, auto_docstring, can_return_tuple, logging
from ...utils.import_utils import requires
from ..auto import CONFIG_MAPPING, AutoConfig
@@ -32,7 +32,7 @@
from ..cohere.modeling_cohere import apply_rotary_pos_emb
from ..llama.modeling_llama import LlamaAttention, eager_attention_forward
from ..superglue.image_processing_pil_superglue import SuperGlueImageProcessorPil
from ..superglue.image_processing_superglue import SuperGlueImageProcessor
from ..superglue.image_processing_superglue import SuperGlueImageProcessor, SuperGlueImageProcessorKwargs
from ..superpoint import SuperPointConfig


@@ -154,13 +154,8 @@ class LightGlueKeypointMatchingOutput(ModelOutput):
attentions: tuple[torch.FloatTensor] | None = None


class LightGlueImageProcessorKwargs(ImagesKwargs, total=False):
r"""
do_grayscale (`bool`, *optional*, defaults to `self.do_grayscale`):
Whether to convert the image to grayscale. Can be overridden by `do_grayscale` in the `preprocess` method.
"""

do_grayscale: bool
class LightGlueImageProcessorKwargs(SuperGlueImageProcessorKwargs):
pass


class LightGlueImageProcessor(SuperGlueImageProcessor):
@@ -34,7 +34,7 @@
)
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_outputs import BaseModelOutputWithPooling
from ...processing_utils import ImagesKwargs, Unpack
from ...processing_utils import Unpack
from ...utils import TensorType, auto_docstring, logging
from ...utils.generic import can_return_tuple, merge_with_config_defaults
from ..llava_next.image_processing_llava_next import LlavaNextImageProcessor, LlavaNextImageProcessorKwargs
@@ -217,17 +217,6 @@ def _preprocess(
)


class LlavaOnevisionImageProcessorKwargs(ImagesKwargs, total=False):
r"""
image_grid_pinpoints (`list[list[int]]`, *optional*):
A list of possible resolutions to use for processing high resolution images. The best resolution is selected
based on the original size of the image. Can be overridden by `image_grid_pinpoints` in the `preprocess`
method.
"""

image_grid_pinpoints: list[list[int]]


class LlavaOnevisionImageProcessorPil(LlavaNextImageProcessorPil):
resample = PILImageResampling.BICUBIC
image_mean = OPENAI_CLIP_MEAN
28 changes: 0 additions & 28 deletions src/transformers/models/mask2former/modular_mask2former.py
@@ -15,8 +15,6 @@
import torch
from torch import nn

from ...image_utils import SizeDict
from ...processing_utils import ImagesKwargs
from ...utils import (
TensorType,
logging,
@@ -35,32 +33,6 @@
logger = logging.get_logger(__name__)


class Mask2FormerImageProcessorKwargs(ImagesKwargs, total=False):
r"""
ignore_index (`int`, *optional*):
Label to be assigned to background pixels in segmentation maps. If provided, segmentation map pixels
denoted with 0 (background) will be replaced with `ignore_index`.
do_reduce_labels (`bool`, *optional*, defaults to `False`):
Whether or not to decrement all label values of segmentation maps by 1. Usually used for datasets where 0
is used for background, and background itself is not included in all classes of a dataset (e.g. ADE20k).
The background label will be replaced by `ignore_index`.
num_labels (`int`, *optional*):
The number of labels in the segmentation map.
size_divisor (`int`, *optional*, defaults to `32`):
Some backbones need images divisible by a certain number. If not passed, it defaults to the value used in
Swin Transformer.
pad_size (`SizeDict`, *optional*):
The size to pad the images to. Must be larger than any image size provided for preprocessing. If `pad_size`
is not provided, images will be padded to the largest height and width in the batch.
"""

ignore_index: int | None
do_reduce_labels: bool
num_labels: int | None
size_divisor: int
pad_size: SizeDict | None


class Mask2FormerImageProcessor(MaskFormerImageProcessor):
def post_process_semantic_segmentation(
self, outputs, target_sizes: list[tuple[int, int]] | None = None
11 changes: 2 additions & 9 deletions src/transformers/models/paddleocr_vl/modular_paddleocr_vl.py
@@ -38,9 +38,8 @@
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPast, BaseModelOutputWithPooling
from ...modeling_utils import PreTrainedModel
from ...models.qwen2_vl.image_processing_pil_qwen2_vl import Qwen2VLImageProcessorPil
from ...models.qwen2_vl.image_processing_qwen2_vl import Qwen2VLImageProcessor
from ...models.qwen2_vl.image_processing_qwen2_vl import Qwen2VLImageProcessor, Qwen2VLImageProcessorKwargs
from ...processing_utils import (
ImagesKwargs,
ProcessingKwargs,
ProcessorMixin,
Unpack,
@@ -123,7 +122,7 @@ def smart_resize(
return h_bar, w_bar


class PaddleOCRVLImageProcessorKwargs(ImagesKwargs, total=False):
class PaddleOCRVLImageProcessorKwargs(Qwen2VLImageProcessorKwargs):
r"""
patch_size (`int`, *optional*, defaults to 14):
The spatial patch size of the vision encoder.
@@ -133,12 +132,6 @@ class PaddleOCRVLImageProcessorKwargs(ImagesKwargs, total=False):
The merge size of the vision encoder to llm encoder.
"""

min_pixels: int
max_pixels: int
patch_size: int
temporal_patch_size: int
merge_size: int


class PaddleOCRVLImageProcessorPil(Qwen2VLImageProcessorPil):
size = {"shortest_edge": 384 * 384, "longest_edge": 1536 * 1536}