From aea65470dcb19576e9ac8e929f3ecb34cb1af794 Mon Sep 17 00:00:00 2001 From: agamjots Date: Tue, 22 Jul 2025 11:53:56 -0700 Subject: [PATCH 01/33] initial commit --- .../models/imagegpt/image_processing_imagegpt_fast.py | 1 + 1 file changed, 1 insertion(+) create mode 100644 src/transformers/models/imagegpt/image_processing_imagegpt_fast.py diff --git a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py new file mode 100644 index 000000000000..c23659b3d5a2 --- /dev/null +++ b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py @@ -0,0 +1 @@ +#Will add fast image processing for imagegpt \ No newline at end of file From d49fa73af4e3e5e3230ae2f639c9d129851498f4 Mon Sep 17 00:00:00 2001 From: agamjots Date: Tue, 22 Jul 2025 12:38:44 -0700 Subject: [PATCH 02/33] initial setup --- docs/source/en/model_doc/imagegpt.md | 5 ++ .../models/auto/image_processing_auto.py | 2 +- src/transformers/models/imagegpt/__init__.py | 1 + .../image_processing_imagegpt_fast.py | 48 ++++++++++++++++++- .../test_image_processing_imagegpt.py | 6 ++- 5 files changed, 59 insertions(+), 3 deletions(-) diff --git a/docs/source/en/model_doc/imagegpt.md b/docs/source/en/model_doc/imagegpt.md index 7fbec62d30bb..ac6b664d439c 100644 --- a/docs/source/en/model_doc/imagegpt.md +++ b/docs/source/en/model_doc/imagegpt.md @@ -103,6 +103,11 @@ If you're interested in submitting a resource to be included here, please feel f [[autodoc]] ImageGPTImageProcessor - preprocess +## ImageGPTImageProcessorFast + +[[autodoc]] ImageGPTImageProcessorFast + - preprocess + ## ImageGPTModel [[autodoc]] ImageGPTModel diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index 775d94b25b91..cb664bb1040e 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -104,7 +104,7 @@ 
("idefics2", ("Idefics2ImageProcessor", "Idefics2ImageProcessorFast")), ("idefics3", ("Idefics3ImageProcessor", "Idefics3ImageProcessorFast")), ("ijepa", ("ViTImageProcessor", "ViTImageProcessorFast")), - ("imagegpt", ("ImageGPTImageProcessor",)), + ("imagegpt", ("ImageGPTImageProcessor", "ImageGPTImageProcessorFast")), ("instructblip", ("BlipImageProcessor", "BlipImageProcessorFast")), ("instructblipvideo", ("InstructBlipVideoImageProcessor",)), ("janus", ("JanusImageProcessor")), diff --git a/src/transformers/models/imagegpt/__init__.py b/src/transformers/models/imagegpt/__init__.py index cb79cea50d6e..098ffb6296f5 100644 --- a/src/transformers/models/imagegpt/__init__.py +++ b/src/transformers/models/imagegpt/__init__.py @@ -21,6 +21,7 @@ from .configuration_imagegpt import * from .feature_extraction_imagegpt import * from .image_processing_imagegpt import * + from .image_processing_imagegpt_fast import * from .modeling_imagegpt import * else: import sys diff --git a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py index c23659b3d5a2..8b823e5675a9 100644 --- a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py +++ b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py @@ -1 +1,47 @@ -#Will add fast image processing for imagegpt \ No newline at end of file +# coding=utf-8 +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +"""Fast Image processor class for ImageGPT.""" + +from ...image_processing_utils_fast import BaseImageProcessorFast +from ...image_utils import PILImageResampling +from ...utils import auto_docstring + + +@auto_docstring +class ImageGPTImageProcessorFast(BaseImageProcessorFast): + # This generated class can be used as a starting point for the fast image processor. + # if the image processor is only used for simple augmentations, such as resizing, center cropping, rescaling, or normalizing, + # only the default values should be set in the class. + # If the image processor requires more complex augmentations, methods from BaseImageProcessorFast can be overridden. + # In most cases, only the `_preprocess` method should be overridden. + + # For an example of a fast image processor requiring more complex augmentations, see `LlavaNextImageProcessorFast`. + + # Default values should be checked against the slow image processor + # None values left after checking can be removed + resample = PILImageResampling.BILINEAR + image_mean = None + image_std = None + size = {"height": 256, "width": 256} + default_to_square = None + crop_size = None + do_resize = True + do_center_crop = None + do_rescale = None + do_normalize = True + do_convert_rgb = None + + +__all__ = ["ImageGPTImageProcessorFast"] diff --git a/tests/models/imagegpt/test_image_processing_imagegpt.py b/tests/models/imagegpt/test_image_processing_imagegpt.py index de29b8e29fbd..db26154a562c 100644 --- a/tests/models/imagegpt/test_image_processing_imagegpt.py +++ b/tests/models/imagegpt/test_image_processing_imagegpt.py @@ -23,7 +23,7 @@ from transformers import AutoImageProcessor from transformers.testing_utils import check_json_file_has_correct_format, require_torch, require_vision, slow -from transformers.utils import is_torch_available, is_vision_available +from transformers.utils import is_torch_available, 
is_torchvision_available, is_vision_available from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs @@ -36,6 +36,9 @@ from transformers import ImageGPTImageProcessor + if is_torchvision_available(): + from transformers import ImageGPTImageProcessorFast + class ImageGPTImageProcessingTester: def __init__( @@ -94,6 +97,7 @@ def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=F @require_vision class ImageGPTImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): image_processing_class = ImageGPTImageProcessor if is_vision_available() else None + fast_image_processing_class = ImageGPTImageProcessorFast if is_torchvision_available() else None def setUp(self): super().setUp() From aead2217fbe9559f60ec552f4ec0bc4eaabc1c6b Mon Sep 17 00:00:00 2001 From: agamjots Date: Fri, 25 Jul 2025 00:42:38 -0700 Subject: [PATCH 03/33] Overiding imageGPT specific functions --- .../imagegpt/image_processing_imagegpt_fast.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py index 8b823e5675a9..69ca7ff3447b 100644 --- a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py +++ b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py @@ -14,11 +14,26 @@ # limitations under the License. 
"""Fast Image processor class for ImageGPT.""" +import numpy as np +from typing import Dict, List, Optional, Tuple, Union + from ...image_processing_utils_fast import BaseImageProcessorFast from ...image_utils import PILImageResampling from ...utils import auto_docstring +def squared_euclidean_distance_fast(a, b): + b = b.T + a2 = torch.sum(a ** 2, dim = 1) + b2 = torch.sum(b ** 2, dim = 0) + ab = torch.matmul(a, b) + d = a2[:, None] - 2 * ab + b2[None, :] + return d + +def color_quantize_fast(x, clusters): + x = x.reshape(-1, 3) + d = squared_euclidean_distance_fast(x, clusters) + return np.argmin(d, axis=1) @auto_docstring class ImageGPTImageProcessorFast(BaseImageProcessorFast): # This generated class can be used as a starting point for the fast image processor. From dc1b1910fbc5885dd367015d28dedf68f234c1b8 Mon Sep 17 00:00:00 2001 From: Ethan Ayaay Date: Mon, 28 Jul 2025 11:57:58 -0700 Subject: [PATCH 04/33] imported is_torch_available and utilized it for importing torch in imageGPT fast --- .../image_processing_imagegpt_fast.py | 35 +++++++++++++------ 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py index 69ca7ff3447b..e3c94fcd6fc4 100644 --- a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py +++ b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py @@ -19,7 +19,13 @@ from ...image_processing_utils_fast import BaseImageProcessorFast from ...image_utils import PILImageResampling -from ...utils import auto_docstring +from ...utils import ( + auto_docstring, + is_torch_available +) + +if is_torch_available(): + import torch def squared_euclidean_distance_fast(a, b): b = b.T @@ -29,11 +35,11 @@ def squared_euclidean_distance_fast(a, b): d = a2[:, None] - 2 * ab + b2[None, :] return d - def color_quantize_fast(x, clusters): x = x.reshape(-1, 3) d = squared_euclidean_distance_fast(x, 
clusters) return np.argmin(d, axis=1) + @auto_docstring class ImageGPTImageProcessorFast(BaseImageProcessorFast): # This generated class can be used as a starting point for the fast image processor. @@ -47,16 +53,25 @@ class ImageGPTImageProcessorFast(BaseImageProcessorFast): # Default values should be checked against the slow image processor # None values left after checking can be removed resample = PILImageResampling.BILINEAR - image_mean = None - image_std = None - size = {"height": 256, "width": 256} - default_to_square = None - crop_size = None + size = {"height": 256, "width": 256} # import get_size_dict? do_resize = True - do_center_crop = None - do_rescale = None do_normalize = True - do_convert_rgb = None + # need: + # clusters, resample, do_color_quantize + + # initialize these arguments, pass it into super constructor + + # not in base: + image_mean = None # not in base, normalize uses a constant factor to divide pixel values + image_std = None # not in base, normalize uses a constant factor to divide pixel values + default_to_square = None # not in base + crop_size = None # not in base + do_center_crop = None # not in base + do_rescale = None # not in base + do_convert_rgb = None # not in base + +# preprocessor has additional kwargs: + # images, return_tensors, data_format, input_data_format __all__ = ["ImageGPTImageProcessorFast"] From daedee9aa6327966f4635f5645aec1a663229a18 Mon Sep 17 00:00:00 2001 From: Ethan Ayaay Date: Mon, 28 Jul 2025 12:26:16 -0700 Subject: [PATCH 05/33] Created init and ImageGPTFastImageProcessorKwargs --- .../image_processing_imagegpt_fast.py | 20 ++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py index e3c94fcd6fc4..938df44936b5 100644 --- a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py +++ 
b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py @@ -17,7 +17,11 @@ import numpy as np from typing import Dict, List, Optional, Tuple, Union -from ...image_processing_utils_fast import BaseImageProcessorFast +from ...image_processing_utils_fast import ( + BaseImageProcessorFast, + DefaultFastImageProcessorKwargs +) +from ...processing_utils import Unpack from ...image_utils import PILImageResampling from ...utils import ( auto_docstring, @@ -40,6 +44,11 @@ def color_quantize_fast(x, clusters): d = squared_euclidean_distance_fast(x, clusters) return np.argmin(d, axis=1) +class ImageGPTFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): + do_color_quantize: Optional[bool] = True + clusters: Optional[np.ndarray] = None + resample: Optional[PILImageResampling] = PILImageResampling.BILINEAR + @auto_docstring class ImageGPTImageProcessorFast(BaseImageProcessorFast): # This generated class can be used as a starting point for the fast image processor. @@ -56,10 +65,15 @@ class ImageGPTImageProcessorFast(BaseImageProcessorFast): size = {"height": 256, "width": 256} # import get_size_dict? 
do_resize = True do_normalize = True - # need: - # clusters, resample, do_color_quantize + + # Specific Kwargs + do_color_quantize = True + clusters = None + resample = PILImageResampling.BILINEAR # initialize these arguments, pass it into super constructor + def __init__(self, **kwargs: Unpack[ImageGPTFastImageProcessorKwargs]): + super().__init__(**kwargs) # not in base: image_mean = None # not in base, normalize uses a constant factor to divide pixel values From 8608e19e8b995149e5794948dee62226782e2671 Mon Sep 17 00:00:00 2001 From: Ethan Ayaay Date: Mon, 28 Jul 2025 12:36:47 -0700 Subject: [PATCH 06/33] added return_tensors, data_format, and input_data_format to ImageGPTFastImageProcessorKwargs --- .../image_processing_imagegpt_fast.py | 35 ++++++++++++++----- 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py index 938df44936b5..344521438b20 100644 --- a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py +++ b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py @@ -19,13 +19,19 @@ from ...image_processing_utils_fast import ( BaseImageProcessorFast, - DefaultFastImageProcessorKwargs + DefaultFastImageProcessorKwargs, + BatchFeature ) from ...processing_utils import Unpack -from ...image_utils import PILImageResampling +from ...image_utils import ( + PILImageResampling, + ImageInput, + ChannelDimension +) from ...utils import ( auto_docstring, - is_torch_available + is_torch_available, + TensorType ) if is_torch_available(): @@ -48,6 +54,9 @@ class ImageGPTFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): do_color_quantize: Optional[bool] = True clusters: Optional[np.ndarray] = None resample: Optional[PILImageResampling] = PILImageResampling.BILINEAR + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: Optional[Union[str, ChannelDimension]] = 
ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, @auto_docstring class ImageGPTImageProcessorFast(BaseImageProcessorFast): @@ -71,10 +80,6 @@ class ImageGPTImageProcessorFast(BaseImageProcessorFast): clusters = None resample = PILImageResampling.BILINEAR - # initialize these arguments, pass it into super constructor - def __init__(self, **kwargs: Unpack[ImageGPTFastImageProcessorKwargs]): - super().__init__(**kwargs) - # not in base: image_mean = None # not in base, normalize uses a constant factor to divide pixel values image_std = None # not in base, normalize uses a constant factor to divide pixel values @@ -84,8 +89,20 @@ def __init__(self, **kwargs: Unpack[ImageGPTFastImageProcessorKwargs]): do_rescale = None # not in base do_convert_rgb = None # not in base -# preprocessor has additional kwargs: - # images, return_tensors, data_format, input_data_format + # initialize these arguments, pass it into super constructor + def __init__(self, **kwargs: Unpack[ImageGPTFastImageProcessorKwargs]): + super().__init__(**kwargs) + + # _preprocessor has additional kwargs: + # images, return_tensors, data_format, input_data_format + + # PUBLIC preprocess: + def preprocess(self, images: ImageInput, **kwargs: Unpack[ImageGPTFastImageProcessorKwargs]) -> BatchFeature: + return super().preprocess(images, **kwargs) + # PRIVATE preprocess: + def _preprocess(self): + # TODO: Override + pass __all__ = ["ImageGPTImageProcessorFast"] From b772356022d71ab2f771ee39e1a301d6d2b3ded4 Mon Sep 17 00:00:00 2001 From: Ethan Ayaay Date: Mon, 28 Jul 2025 12:45:05 -0700 Subject: [PATCH 07/33] set up arguments and process and _preprocess definitions --- .../models/imagegpt/image_processing_imagegpt_fast.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py index 344521438b20..88f852b2d03c 100644 --- 
a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py +++ b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py @@ -56,7 +56,7 @@ class ImageGPTFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): resample: Optional[PILImageResampling] = PILImageResampling.BILINEAR return_tensors: Optional[Union[str, TensorType]] = None, data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.FIRST, - input_data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None @auto_docstring class ImageGPTImageProcessorFast(BaseImageProcessorFast): @@ -80,7 +80,7 @@ class ImageGPTImageProcessorFast(BaseImageProcessorFast): clusters = None resample = PILImageResampling.BILINEAR - # not in base: + # not in base ########## image_mean = None # not in base, normalize uses a constant factor to divide pixel values image_std = None # not in base, normalize uses a constant factor to divide pixel values default_to_square = None # not in base @@ -88,6 +88,7 @@ class ImageGPTImageProcessorFast(BaseImageProcessorFast): do_center_crop = None # not in base do_rescale = None # not in base do_convert_rgb = None # not in base + ############ # initialize these arguments, pass it into super constructor def __init__(self, **kwargs: Unpack[ImageGPTFastImageProcessorKwargs]): From 9e80e0ac0aa849da534cb99c6c254f2565361311 Mon Sep 17 00:00:00 2001 From: chris Date: Fri, 1 Aug 2025 11:38:11 -0700 Subject: [PATCH 08/33] Added arguments to _preprocess --- .../image_processing_imagegpt_fast.py | 24 ++++++++++++++----- 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py index 88f852b2d03c..3297f5fb4954 100644 --- a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py +++ b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py @@ -13,7 
+13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Fast Image processor class for ImageGPT.""" - +import PIL import numpy as np from typing import Dict, List, Optional, Tuple, Union @@ -24,9 +24,9 @@ ) from ...processing_utils import Unpack from ...image_utils import ( - PILImageResampling, - ImageInput, - ChannelDimension + PILImageResampling, + ImageInput, + ChannelDimension, SizeDict ) from ...utils import ( auto_docstring, @@ -71,7 +71,7 @@ class ImageGPTImageProcessorFast(BaseImageProcessorFast): # Default values should be checked against the slow image processor # None values left after checking can be removed resample = PILImageResampling.BILINEAR - size = {"height": 256, "width": 256} # import get_size_dict? + size = {"height": 256, "width": 256} # import get_size_dict?, can be overridden in preprocess do_resize = True do_normalize = True @@ -102,8 +102,20 @@ def preprocess(self, images: ImageInput, **kwargs: Unpack[ImageGPTFastImageProce return super().preprocess(images, **kwargs) # PRIVATE preprocess: - def _preprocess(self): + def _preprocess( + self, + images: list["torch.Tensor"], + do_resize: bool, + size: SizeDict, + do_normalize: bool, + return_tensors: Optional[Union[str, TensorType]], + **kwargs, + ) -> BatchFeature: # TODO: Override + # Resize to specific size + # Normalize pixel values + # Optionally color quantize into clusters + # Return processed images in a specified tensor format pass __all__ = ["ImageGPTImageProcessorFast"] From f9f3ad8cd057585e372cede7376251f6f7d194f4 Mon Sep 17 00:00:00 2001 From: chris Date: Fri, 1 Aug 2025 12:23:15 -0700 Subject: [PATCH 09/33] Added additional optional arguments --- .../image_processing_imagegpt_fast.py | 46 +++++++++++++++++-- 1 file changed, 41 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py index 
3297f5fb4954..0c625a358304 100644 --- a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py +++ b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py @@ -17,16 +17,19 @@ import numpy as np from typing import Dict, List, Optional, Tuple, Union +from ..mllama.image_processing_mllama import to_channel_dimension_format +from ...image_processing_utils import get_size_dict from ...image_processing_utils_fast import ( - BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, - BatchFeature + BaseImageProcessorFast, + DefaultFastImageProcessorKwargs, + BatchFeature, logger ) from ...processing_utils import Unpack from ...image_utils import ( PILImageResampling, ImageInput, - ChannelDimension, SizeDict + ChannelDimension, SizeDict, make_list_of_images, valid_images, validate_preprocess_arguments, is_scaled_image, + infer_channel_dimension_format ) from ...utils import ( auto_docstring, @@ -107,15 +110,48 @@ def _preprocess( images: list["torch.Tensor"], do_resize: bool, size: SizeDict, + resample: PILImageResampling, do_normalize: bool, + do_color_quantize: Optional[bool], + clusters: Optional[Union[list[list[int]], np.ndarray]], return_tensors: Optional[Union[str, TensorType]], + data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, **kwargs, ) -> BatchFeature: + # TODO: Override # Resize to specific size # Normalize pixel values # Optionally color quantize into clusters # Return processed images in a specified tensor format - pass + + do_resize = do_resize if do_resize is not None else self.do_resize + size = size if size is not None else self.size + resample = resample if resample is not None else self.resample + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + do_color_quantize = do_color_quantize if do_color_quantize is not None else self.do_color_quantize + clusters = clusters if clusters is not None else 
self.clusters + clusters = np.array(clusters) + + images = make_list_of_images(images) + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + + # Here, normalize() is using a constant factor to divide pixel values. + # hence, the method does not need iamge_mean and image_std. + validate_preprocess_arguments( + do_resize=do_resize, + size=size, + resample=resample, + ) + + if do_color_quantize and clusters is None: + raise ValueError("Clusters must be specified if do_color_quantize is True.") + __all__ = ["ImageGPTImageProcessorFast"] From 870cd9ade501dd06183bda2259ece381eb17158f Mon Sep 17 00:00:00 2001 From: Ethan Ayaay Date: Fri, 1 Aug 2025 13:01:06 -0700 Subject: [PATCH 10/33] Copied logic over from base imageGPT processor --- .../image_processing_imagegpt_fast.py | 65 ++++++++++++++++--- 1 file changed, 55 insertions(+), 10 deletions(-) diff --git a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py index 0c625a358304..5993a0e2bcbd 100644 --- a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py +++ b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py @@ -17,8 +17,8 @@ import numpy as np from typing import Dict, List, Optional, Tuple, Union -from ..mllama.image_processing_mllama import to_channel_dimension_format -from ...image_processing_utils import get_size_dict +from transformers.image_transforms import to_channel_dimension_format + from ...image_processing_utils_fast import ( BaseImageProcessorFast, DefaultFastImageProcessorKwargs, @@ -28,8 +28,8 @@ from ...image_utils import ( PILImageResampling, ImageInput, - ChannelDimension, SizeDict, make_list_of_images, valid_images, validate_preprocess_arguments, is_scaled_image, - infer_channel_dimension_format + ChannelDimension, SizeDict, + infer_channel_dimension_format, 
make_list_of_images, valid_images, validate_preprocess_arguments, is_scaled_image, ) from ...utils import ( auto_docstring, @@ -54,6 +54,7 @@ def color_quantize_fast(x, clusters): return np.argmin(d, axis=1) class ImageGPTFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): + # TODO: Add documentation for each argument do_color_quantize: Optional[bool] = True clusters: Optional[np.ndarray] = None resample: Optional[PILImageResampling] = PILImageResampling.BILINEAR @@ -120,12 +121,6 @@ def _preprocess( **kwargs, ) -> BatchFeature: - # TODO: Override - # Resize to specific size - # Normalize pixel values - # Optionally color quantize into clusters - # Return processed images in a specified tensor format - do_resize = do_resize if do_resize is not None else self.do_resize size = size if size is not None else self.size resample = resample if resample is not None else self.resample @@ -154,4 +149,54 @@ def _preprocess( raise ValueError("Clusters must be specified if do_color_quantize is True.") + # TODO: + + # Resize to specific size + + # Normalize pixel values + + # Optionally color quantize into clusters + + # Return processed images in a specified tensor format + + if do_normalize and is_scaled_image(images[0]): + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. If you wish to do this, " + "make sure to set `do_normalize` to `False` and that pixel values are between [-1, 1].", + ) + + if input_data_format is None: + # We assume that all images have the same channel dimension format. 
+ input_data_format = infer_channel_dimension_format(images[0]) + + if do_resize: + images = [ + self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format) + for image in images + ] + + if do_normalize: + images = [self.normalize(image=image, input_data_format=input_data_format) for image in images] + + if do_color_quantize: + images = [to_channel_dimension_format(image, ChannelDimension.LAST, input_data_format) for image in images] + # color quantize from (batch_size, height, width, 3) to (batch_size, height, width) + images = np.array(images) + images = color_quantize_fast(images, clusters).reshape(images.shape[:-1]) + + # flatten to (batch_size, height*width) + batch_size = images.shape[0] + images = images.reshape(batch_size, -1) + + # We need to convert back to a list of images to keep consistent behaviour across processors. + images = list(images) + else: + images = [ + to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) + for image in images + ] + + data = {"input_ids": images} + return BatchFeature(data=data, tensor_type=return_tensors) + __all__ = ["ImageGPTImageProcessorFast"] From 3604c7a15908f722eb2bf58f725840816c33d424 Mon Sep 17 00:00:00 2001 From: Ethan Ayaay Date: Tue, 5 Aug 2025 12:58:09 -0700 Subject: [PATCH 11/33] Implemented 2nd draft of fast imageGPT preprocess using batch processing --- .../image_processing_imagegpt_fast.py | 122 +++++++++--------- 1 file changed, 59 insertions(+), 63 deletions(-) diff --git a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py index 5993a0e2bcbd..7649995b5fd1 100644 --- a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py +++ b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py @@ -22,7 +22,8 @@ from ...image_processing_utils_fast import ( BaseImageProcessorFast, DefaultFastImageProcessorKwargs, - BatchFeature, logger + 
BatchFeature, logger, + group_images_by_shape, reorder_images ) from ...processing_utils import Unpack from ...image_utils import ( @@ -111,92 +112,87 @@ def _preprocess( images: list["torch.Tensor"], do_resize: bool, size: SizeDict, + interpolation: Optional["F.InterpolationMode"], resample: PILImageResampling, do_normalize: bool, do_color_quantize: Optional[bool], clusters: Optional[Union[list[list[int]], np.ndarray]], return_tensors: Optional[Union[str, TensorType]], + disable_grouping: Optional[bool], data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.FIRST, input_data_format: Optional[Union[str, ChannelDimension]] = None, + image_mean: Optional[Union[float, list[float]]] = None, + image_std: Optional[Union[float, list[float]]] = None, **kwargs, ) -> BatchFeature: - do_resize = do_resize if do_resize is not None else self.do_resize size = size if size is not None else self.size resample = resample if resample is not None else self.resample do_normalize = do_normalize if do_normalize is not None else self.do_normalize do_color_quantize = do_color_quantize if do_color_quantize is not None else self.do_color_quantize clusters = clusters if clusters is not None else self.clusters - clusters = np.array(clusters) - - images = make_list_of_images(images) - - if not valid_images(images): - raise ValueError( - "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " - "torch.Tensor, tf.Tensor or jax.ndarray." - ) - - # Here, normalize() is using a constant factor to divide pixel values. - # hence, the method does not need iamge_mean and image_std. - validate_preprocess_arguments( - do_resize=do_resize, - size=size, - resample=resample, - ) + # 1. Setup. Validate ImageGPT-specific requirements + # Check for do_color_quantize and clusters. if do_color_quantize and clusters is None: raise ValueError("Clusters must be specified if do_color_quantize is True.") + # Clusters come in np arrays. Convert to torch tensors. 
+ if clusters is not None: + cluster_tensors = torch.tensor(clusters, dtype=torch.float32) + if images[0].is_cuda: + # if image is stored on a CUDA GPA, convert tensors to CUDA + cluster_tensors = cluster_tensors.cuda() + + # 2. Group images into batches of the same shape for more efficient processing. + grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping) + unordered_processed_images = {} + + # Loop through shapes and stacked images + for shape, stacked_images in grouped_images.items(): + # Resize to specific sizes (if do_resize is specified) + if do_resize: + stacked_images = self.resize( + image=stacked_images, + size=size, + interpolation=interpolation + ) + + # Normalize pixel values (if do_normalize is specified) + if do_normalize: + stacked_images = stacked_images.float() + stacked_images = (stacked_images / 127.5) - 1.0 + + unordered_processed_images[shape] = stacked_images + + # 3. Reorder and maintain original image order after processing into batches + processed_images = reorder_images(unordered_processed_images, grouped_images_index) + + # 4. Color quantize if specified + if do_color_quantize: + quantized_images = [] + for image in processed_images: + # Convert CHW to HWC for quantization + image_hwc = image.permute(1, 2, 0) # (H, W, C) - # TODO: - - # Resize to specific size - - # Normalize pixel values - - # Optionally color quantize into clusters - - # Return processed images in a specified tensor format - - if do_normalize and is_scaled_image(images[0]): - logger.warning_once( - "It looks like you are trying to rescale already rescaled images. If you wish to do this, " - "make sure to set `do_normalize` to `False` and that pixel values are between [-1, 1].", - ) - - if input_data_format is None: - # We assume that all images have the same channel dimension format. 
- input_data_format = infer_channel_dimension_format(images[0]) - - if do_resize: - images = [ - self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format) - for image in images - ] + # Denormalize back to [0, 255] for quantization + image_hwc = (image_hwc + 1.0) * 127.5 + image_hwc = torch.clamp(image_hwc, 0, 255) - if do_normalize: - images = [self.normalize(image=image, input_data_format=input_data_format) for image in images] + # Fast torch-based color quantization + quantized = self._color_quantize_torch(image_hwc, cluster_tensors) - if do_color_quantize: - images = [to_channel_dimension_format(image, ChannelDimension.LAST, input_data_format) for image in images] - # color quantize from (batch_size, height, width, 3) to (batch_size, height, width) - images = np.array(images) - images = color_quantize_fast(images, clusters).reshape(images.shape[:-1]) + # Flatten to sequence (H*W,) + quantized_flat = quantized.view(-1) + quantized_images.append(quantized_flat) - # flatten to (batch_size, height*width) - batch_size = images.shape[0] - images = images.reshape(batch_size, -1) + # Stack all quantized sequences + input_ids = torch.stack(quantized_images, dim=0) - # We need to convert back to a list of images to keep consistent behaviour across processors. - images = list(images) + return BatchFeature(data={"input_ids": input_ids}, tensor_type=return_tensors) else: - images = [ - to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) - for image in images - ] - - data = {"input_ids": images} - return BatchFeature(data=data, tensor_type=return_tensors) + # 5. 
Standard output without quantizing + pixel_values = torch.stack(processed_images, dim=0) + return BatchFeature(data={"pixel_values": pixel_values}, tensor_type=return_tensors) __all__ = ["ImageGPTImageProcessorFast"] From fd5c1362b2d47471431c9b9e74f6eb6d66445a84 Mon Sep 17 00:00:00 2001 From: Ethan Ayaay Date: Tue, 5 Aug 2025 13:37:30 -0700 Subject: [PATCH 12/33] Implemented 3rd draft of imageGPT fast _preprocessor. Pulled logic from BaseImageProcessorFast --- .../image_processing_imagegpt_fast.py | 70 +++++++++++++------ 1 file changed, 48 insertions(+), 22 deletions(-) diff --git a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py index 7649995b5fd1..abd548a5a2ab 100644 --- a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py +++ b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py @@ -35,11 +35,14 @@ from ...utils import ( auto_docstring, is_torch_available, + is_torchvision_available, TensorType ) if is_torch_available(): import torch + if is_torchvision_available(): + from torchvision.transforms import functional as F def squared_euclidean_distance_fast(a, b): b = b.T @@ -52,7 +55,7 @@ def squared_euclidean_distance_fast(a, b): def color_quantize_fast(x, clusters): x = x.reshape(-1, 3) d = squared_euclidean_distance_fast(x, clusters) - return np.argmin(d, axis=1) + return torch.argmin(d, dim=1) class ImageGPTFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): # TODO: Add documentation for each argument @@ -86,8 +89,8 @@ class ImageGPTImageProcessorFast(BaseImageProcessorFast): resample = PILImageResampling.BILINEAR # not in base ########## - image_mean = None # not in base, normalize uses a constant factor to divide pixel values - image_std = None # not in base, normalize uses a constant factor to divide pixel values + image_mean = [0.5, 0.5, 0.5] # not in base, normalize uses a constant factor to divide pixel values + image_std = [0.5, 
0.5, 0.5] # not in base, normalize uses a constant factor to divide pixel values default_to_square = None # not in base crop_size = None # not in base do_center_crop = None # not in base @@ -113,16 +116,20 @@ def _preprocess( do_resize: bool, size: SizeDict, interpolation: Optional["F.InterpolationMode"], - resample: PILImageResampling, + do_center_crop: bool, + crop_size: SizeDict, + do_rescale: bool, + rescale_factor: float, do_normalize: bool, - do_color_quantize: Optional[bool], - clusters: Optional[Union[list[list[int]], np.ndarray]], - return_tensors: Optional[Union[str, TensorType]], - disable_grouping: Optional[bool], - data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.FIRST, - input_data_format: Optional[Union[str, ChannelDimension]] = None, image_mean: Optional[Union[float, list[float]]] = None, image_std: Optional[Union[float, list[float]]] = None, + disable_grouping: Optional[bool] = False, + return_tensors: Optional[Union[str, TensorType]] = None, + resample: Optional[PILImageResampling] = None, + do_color_quantize: Optional[bool] = None, + clusters: Optional[Union[list[list[int]], np.ndarray]] = None, + data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, **kwargs, ) -> BatchFeature: do_resize = do_resize if do_resize is not None else self.do_resize @@ -131,6 +138,8 @@ def _preprocess( do_normalize = do_normalize if do_normalize is not None else self.do_normalize do_color_quantize = do_color_quantize if do_color_quantize is not None else self.do_color_quantize clusters = clusters if clusters is not None else self.clusters + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std # 1. Setup. Validate ImageGPT-specific requirements # Check for do_color_quantize and clusters. 
@@ -138,16 +147,16 @@ def _preprocess( raise ValueError("Clusters must be specified if do_color_quantize is True.") # Clusters come in np arrays. Convert to torch tensors. + cluster_tensors = None if clusters is not None: cluster_tensors = torch.tensor(clusters, dtype=torch.float32) - if images[0].is_cuda: - # if image is stored on a CUDA GPA, convert tensors to CUDA - cluster_tensors = cluster_tensors.cuda() + if len(images) > 0 and images[0].is_cuda: + # if image is stored on a CUDA GPA, convert tensors to CUDA + cluster_tensors = cluster_tensors.cuda() # 2. Group images into batches of the same shape for more efficient processing. grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping) - unordered_processed_images = {} - + resized_images_grouped = {} # Loop through shapes and stacked images for shape, stacked_images in grouped_images.items(): # Resize to specific sizes (if do_resize is specified) @@ -157,16 +166,33 @@ def _preprocess( size=size, interpolation=interpolation ) - # Normalize pixel values (if do_normalize is specified) if do_normalize: - stacked_images = stacked_images.float() - stacked_images = (stacked_images / 127.5) - 1.0 - - unordered_processed_images[shape] = stacked_images + stacked_images = self.normalize( + image=stacked_images, + mean=image_mean, + std=image_std + ) + resized_images_grouped[shape] = stacked_images # 3. 
Reorder and maintain original image order after processing into batches - processed_images = reorder_images(unordered_processed_images, grouped_images_index) + resized_images = reorder_images(resized_images_grouped, grouped_images_index) + + # Group images by size for further processing + # Needed in case do_resize is False, or resize returns images with different sizes + grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping) + processed_images_grouped = {} + for shape, stacked_images in grouped_images.items(): + if do_center_crop: + stacked_images = self.center_crop(stacked_images, crop_size) + # Fused rescale and normalize + stacked_images = self.rescale_and_normalize( + stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std + ) + processed_images_grouped[shape] = stacked_images + + processed_images = reorder_images(processed_images_grouped, grouped_images_index) + processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images # 4. 
Color quantize if specified if do_color_quantize: @@ -180,7 +206,7 @@ def _preprocess( image_hwc = torch.clamp(image_hwc, 0, 255) # Fast torch-based color quantization - quantized = self._color_quantize_torch(image_hwc, cluster_tensors) + quantized = color_quantize_fast(image_hwc, cluster_tensors) # Flatten to sequence (H*W,) quantized_flat = quantized.view(-1) From ec1681bca5100d18f117962d70bb99325e27b346 Mon Sep 17 00:00:00 2001 From: Ethan Ayaay Date: Thu, 7 Aug 2025 11:41:35 -0700 Subject: [PATCH 13/33] modified imageGPT test file to properly run fast processor tests --- .../models/imagegpt/test_image_processing_imagegpt.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tests/models/imagegpt/test_image_processing_imagegpt.py b/tests/models/imagegpt/test_image_processing_imagegpt.py index db26154a562c..154440c2f134 100644 --- a/tests/models/imagegpt/test_image_processing_imagegpt.py +++ b/tests/models/imagegpt/test_image_processing_imagegpt.py @@ -108,11 +108,12 @@ def image_processor_dict(self): return self.image_processor_tester.prepare_image_processor_dict() def test_image_processor_properties(self): - image_processing = self.image_processing_class(**self.image_processor_dict) - self.assertTrue(hasattr(image_processing, "clusters")) - self.assertTrue(hasattr(image_processing, "do_resize")) - self.assertTrue(hasattr(image_processing, "size")) - self.assertTrue(hasattr(image_processing, "do_normalize")) + for image_processing_class in self.image_processors_list: + image_processing = image_processing_class(**self.image_processor_dict) + self.assertTrue(hasattr(image_processing, "clusters")) + self.assertTrue(hasattr(image_processing, "do_resize")) + self.assertTrue(hasattr(image_processing, "size")) + self.assertTrue(hasattr(image_processing, "do_normalize")) def test_image_processor_from_dict_with_kwargs(self): image_processor = self.image_processing_class.from_dict(self.image_processor_dict) From 
432e8f3ba7804867af5f00460504f09abda10a05 Mon Sep 17 00:00:00 2001
From: Ethan Ayaay
Date: Thu, 7 Aug 2025 12:19:53 -0700
Subject: [PATCH 14/33] converts images to torch.float32 from torch.uint8

---
 .../models/imagegpt/image_processing_imagegpt_fast.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py
index abd548a5a2ab..78819fe2fa12 100644
--- a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py
+++ b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py
@@ -125,7 +125,7 @@ def _preprocess(
         image_std: Optional[Union[float, list[float]]] = None,
         disable_grouping: Optional[bool] = False,
         return_tensors: Optional[Union[str, TensorType]] = None,
-        resample: Optional[PILImageResampling] = None,
+        resample: Optional[PILImageResampling] = PILImageResampling.BILINEAR,
         do_color_quantize: Optional[bool] = None,
         clusters: Optional[Union[list[list[int]], np.ndarray]] = None,
         data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.FIRST,
@@ -146,6 +146,9 @@ def _preprocess(
         if do_color_quantize and clusters is None:
             raise ValueError("Clusters must be specified if do_color_quantize is True.")
 
+        # Convert images to torch float32
+        images = [image.to(torch.float32) for image in images]
+
         # Clusters come in np arrays. Convert to torch tensors.
cluster_tensors = None if clusters is not None: From 040678c87153be28c278bf2c767c5519bc8b0568 Mon Sep 17 00:00:00 2001 From: Ethan Ayaay Date: Thu, 7 Aug 2025 12:34:46 -0700 Subject: [PATCH 15/33] fixed a typo with self.image_processor_list in the imagegpt test file --- tests/models/imagegpt/test_image_processing_imagegpt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/imagegpt/test_image_processing_imagegpt.py b/tests/models/imagegpt/test_image_processing_imagegpt.py index 154440c2f134..0936524cfa05 100644 --- a/tests/models/imagegpt/test_image_processing_imagegpt.py +++ b/tests/models/imagegpt/test_image_processing_imagegpt.py @@ -108,7 +108,7 @@ def image_processor_dict(self): return self.image_processor_tester.prepare_image_processor_dict() def test_image_processor_properties(self): - for image_processing_class in self.image_processors_list: + for image_processing_class in self.image_processor_list: image_processing = image_processing_class(**self.image_processor_dict) self.assertTrue(hasattr(image_processing, "clusters")) self.assertTrue(hasattr(image_processing, "do_resize")) From 6e0c6703a3094a0487f5abfbfd846fd9b4f0eeb5 Mon Sep 17 00:00:00 2001 From: Ethan Ayaay Date: Thu, 7 Aug 2025 12:38:35 -0700 Subject: [PATCH 16/33] updated more instances of image_processing = self.image_processing_class in the test file to test fast processor --- .../test_image_processing_imagegpt.py | 109 +++++++++--------- 1 file changed, 56 insertions(+), 53 deletions(-) diff --git a/tests/models/imagegpt/test_image_processing_imagegpt.py b/tests/models/imagegpt/test_image_processing_imagegpt.py index 0936524cfa05..56eaf32895e8 100644 --- a/tests/models/imagegpt/test_image_processing_imagegpt.py +++ b/tests/models/imagegpt/test_image_processing_imagegpt.py @@ -186,43 +186,45 @@ def test_init_without_params(self): # Override the test from ImageProcessingTestMixin as ImageGPT model takes input_ids as input def test_call_pil(self): - # Initialize 
image_processing - image_processing = self.image_processing_class(**self.image_processor_dict) - # create random PIL images - image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False) - for image in image_inputs: - self.assertIsInstance(image, Image.Image) - - # Test not batched input - encoded_images = image_processing(image_inputs[0], return_tensors="pt").input_ids - expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(encoded_images) - self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape)) - - # Test batched - encoded_images = image_processing(image_inputs, return_tensors="pt").input_ids - self.assertEqual( - tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape) - ) + for image_processing_class in self.image_processor_list: + # Initialize image_processing + image_processing = image_processing_class(**self.image_processor_dict) + # create random PIL images + image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False) + for image in image_inputs: + self.assertIsInstance(image, Image.Image) + + # Test not batched input + encoded_images = image_processing(image_inputs[0], return_tensors="pt").input_ids + expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(encoded_images) + self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape)) + + # Test batched + encoded_images = image_processing(image_inputs, return_tensors="pt").input_ids + self.assertEqual( + tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape) + ) # Override the test from ImageProcessingTestMixin as ImageGPT model takes input_ids as input def test_call_numpy(self): - # Initialize image_processing - image_processing = self.image_processing_class(**self.image_processor_dict) - # create random numpy tensors - image_inputs = 
self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True) - for image in image_inputs: - self.assertIsInstance(image, np.ndarray) - - # Test not batched input - encoded_images = image_processing(image_inputs[0], return_tensors="pt").input_ids - expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(encoded_images) - self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape)) - - # Test batched - encoded_images = image_processing(image_inputs, return_tensors="pt").input_ids - self.assertEqual( - tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape) - ) + for image_processing_class in self.image_processor_list: + # Initialize image_processing + image_processing = image_processing_class(**self.image_processor_dict) + # create random numpy tensors + image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True) + for image in image_inputs: + self.assertIsInstance(image, np.ndarray) + + # Test not batched input + encoded_images = image_processing(image_inputs[0], return_tensors="pt").input_ids + expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(encoded_images) + self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape)) + + # Test batched + encoded_images = image_processing(image_inputs, return_tensors="pt").input_ids + self.assertEqual( + tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape) + ) @unittest.skip(reason="ImageGPT assumes clusters for 3 channels") def test_call_numpy_4_channels(self): @@ -230,25 +232,26 @@ def test_call_numpy_4_channels(self): # Override the test from ImageProcessingTestMixin as ImageGPT model takes input_ids as input def test_call_pytorch(self): - # Initialize image_processing - image_processing = self.image_processing_class(**self.image_processor_dict) - # create random 
PyTorch tensors - image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True) - expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs) - - for image in image_inputs: - self.assertIsInstance(image, torch.Tensor) - - # Test not batched input - encoded_images = image_processing(image_inputs[0], return_tensors="pt").input_ids - self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape)) - - # Test batched - encoded_images = image_processing(image_inputs, return_tensors="pt").input_ids - self.assertEqual( - tuple(encoded_images.shape), - (self.image_processor_tester.batch_size, *expected_output_image_shape), - ) + for image_processing_class in self.image_processor_list: + # Initialize image_processing + image_processing = image_processing_class(**self.image_processor_dict) + # create random PyTorch tensors + image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True) + expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs) + + for image in image_inputs: + self.assertIsInstance(image, torch.Tensor) + + # Test not batched input + encoded_images = image_processing(image_inputs[0], return_tensors="pt").input_ids + self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape)) + + # Test batched + encoded_images = image_processing(image_inputs, return_tensors="pt").input_ids + self.assertEqual( + tuple(encoded_images.shape), + (self.image_processor_tester.batch_size, *expected_output_image_shape), + ) def prepare_images(): From a020d5fc6a462f889029b03246adabd093d11848 Mon Sep 17 00:00:00 2001 From: Ethan Ayaay Date: Mon, 11 Aug 2025 10:46:30 -0700 Subject: [PATCH 17/33] standardized normalization to not use image mean or std --- .../image_processing_imagegpt_fast.py | 35 +++++++------------ 1 file changed, 13 insertions(+), 22 deletions(-) diff --git 
a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py index 78819fe2fa12..41a74ecbde98 100644 --- a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py +++ b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py @@ -59,12 +59,10 @@ def color_quantize_fast(x, clusters): class ImageGPTFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): # TODO: Add documentation for each argument - do_color_quantize: Optional[bool] = True clusters: Optional[np.ndarray] = None + do_color_quantize: Optional[bool] = True resample: Optional[PILImageResampling] = PILImageResampling.BILINEAR return_tensors: Optional[Union[str, TensorType]] = None, - data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.FIRST, - input_data_format: Optional[Union[str, ChannelDimension]] = None @auto_docstring class ImageGPTImageProcessorFast(BaseImageProcessorFast): @@ -81,21 +79,18 @@ class ImageGPTImageProcessorFast(BaseImageProcessorFast): resample = PILImageResampling.BILINEAR size = {"height": 256, "width": 256} # import get_size_dict?, can be overridden in preprocess do_resize = True - do_normalize = True - - # Specific Kwargs + do_normalize = False do_color_quantize = True clusters = None - resample = PILImageResampling.BILINEAR # not in base ########## image_mean = [0.5, 0.5, 0.5] # not in base, normalize uses a constant factor to divide pixel values image_std = [0.5, 0.5, 0.5] # not in base, normalize uses a constant factor to divide pixel values - default_to_square = None # not in base - crop_size = None # not in base - do_center_crop = None # not in base - do_rescale = None # not in base - do_convert_rgb = None # not in base + # default_to_square = None # not in base + # crop_size = None # not in base + # do_center_crop = None # not in base + # do_rescale = None # not in base + # do_convert_rgb = None # not in base ############ # initialize these arguments, pass it 
into super constructor @@ -169,13 +164,6 @@ def _preprocess( size=size, interpolation=interpolation ) - # Normalize pixel values (if do_normalize is specified) - if do_normalize: - stacked_images = self.normalize( - image=stacked_images, - mean=image_mean, - std=image_std - ) resized_images_grouped[shape] = stacked_images # 3. Reorder and maintain original image order after processing into batches @@ -188,10 +176,13 @@ def _preprocess( for shape, stacked_images in grouped_images.items(): if do_center_crop: stacked_images = self.center_crop(stacked_images, crop_size) - # Fused rescale and normalize - stacked_images = self.rescale_and_normalize( - stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std + # Fused rescale + stacked_images = self.rescale( + stacked_images, rescale_factor ) + if do_normalize: + stacked_images = (stacked_images / 127.5) - 1.0 + processed_images_grouped[shape] = stacked_images processed_images = reorder_images(processed_images_grouped, grouped_images_index) From 56b3546e2882e09253aac9795ba0a836752ce770 Mon Sep 17 00:00:00 2001 From: Ethan Ayaay Date: Mon, 11 Aug 2025 13:30:48 -0700 Subject: [PATCH 18/33] Merged changes from solution2 branch --- .../image_processing_imagegpt_fast.py | 320 ++++++++---------- 1 file changed, 143 insertions(+), 177 deletions(-) diff --git a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py index 41a74ecbde98..8d62789121f7 100644 --- a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py +++ b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py @@ -13,206 +13,172 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Fast Image processor class for ImageGPT.""" -import PIL + import numpy as np -from typing import Dict, List, Optional, Tuple, Union - -from transformers.image_transforms import to_channel_dimension_format - -from ...image_processing_utils_fast import ( - BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, - BatchFeature, logger, - group_images_by_shape, reorder_images -) -from ...processing_utils import Unpack -from ...image_utils import ( - PILImageResampling, - ImageInput, - ChannelDimension, SizeDict, - infer_channel_dimension_format, make_list_of_images, valid_images, validate_preprocess_arguments, is_scaled_image, -) -from ...utils import ( - auto_docstring, - is_torch_available, - is_torchvision_available, - TensorType -) - -if is_torch_available(): - import torch - if is_torchvision_available(): - from torchvision.transforms import functional as F - -def squared_euclidean_distance_fast(a, b): - b = b.T - a2 = torch.sum(a ** 2, dim = 1) - b2 = torch.sum(b ** 2, dim = 0) - ab = torch.matmul(a, b) +import torch +from typing import Optional, Union + +from ...image_processing_utils_fast import BaseImageProcessorFast +from ...image_utils import PILImageResampling +from ...utils import auto_docstring + + +def squared_euclidean_distance_torch(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor: + """ + Compute squared Euclidean distances between all pixels and clusters. + + Args: + a: (N, 3) tensor of pixel RGB values + b: (M, 3) tensor of cluster RGB values + + Returns: + (N, M) tensor of squared distances + """ + b = b.t() # (3, M) + a2 = torch.sum(a ** 2, dim=1) # (N,) + b2 = torch.sum(b ** 2, dim=0) # (M,) + ab = torch.matmul(a, b) # (N, M) d = a2[:, None] - 2 * ab + b2[None, :] return d -def color_quantize_fast(x, clusters): - x = x.reshape(-1, 3) - d = squared_euclidean_distance_fast(x, clusters) + +def color_quantize_torch(x: torch.Tensor, clusters: torch.Tensor) -> torch.Tensor: + """ + Assign each pixel to its nearest color cluster. 
+ + Args: + x: (H*W, 3) tensor of flattened pixel RGB values + clusters: (n_clusters, 3) tensor of cluster RGB values + + Returns: + (H*W,) tensor of cluster indices + """ + d = squared_euclidean_distance_torch(x, clusters) return torch.argmin(d, dim=1) -class ImageGPTFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - # TODO: Add documentation for each argument - clusters: Optional[np.ndarray] = None - do_color_quantize: Optional[bool] = True - resample: Optional[PILImageResampling] = PILImageResampling.BILINEAR - return_tensors: Optional[Union[str, TensorType]] = None, @auto_docstring class ImageGPTImageProcessorFast(BaseImageProcessorFast): - # This generated class can be used as a starting point for the fast image processor. - # if the image processor is only used for simple augmentations, such as resizing, center cropping, rescaling, or normalizing, - # only the default values should be set in the class. - # If the image processor requires more complex augmentations, methods from BaseImageProcessorFast can be overridden. - # In most cases, only the `_preprocess` method should be overridden. + """ + Constructs a fast ImageGPT image processor. - # For an example of a fast image processor requiring more complex augmentations, see `LlavaNextImageProcessorFast`. + This processor can be used to resize images to a smaller resolution (such as 32x32 or 64x64), + normalize them and finally color quantize them to obtain sequences of "pixel values" (color clusters). 
+ """ - # Default values should be checked against the slow image processor - # None values left after checking can be removed + model_input_names = ["input_ids"] + + # Defaults largely aligned with the slow processor, except normalization which we do manually to [-1, 1] resample = PILImageResampling.BILINEAR - size = {"height": 256, "width": 256} # import get_size_dict?, can be overridden in preprocess + size = {"height": 256, "width": 256} do_resize = True + # We do NOT use the base normalization/rescale as ImageGPT expects (x/127.5 - 1) + do_rescale = False do_normalize = False - do_color_quantize = True - clusters = None - - # not in base ########## - image_mean = [0.5, 0.5, 0.5] # not in base, normalize uses a constant factor to divide pixel values - image_std = [0.5, 0.5, 0.5] # not in base, normalize uses a constant factor to divide pixel values - # default_to_square = None # not in base - # crop_size = None # not in base - # do_center_crop = None # not in base - # do_rescale = None # not in base - # do_convert_rgb = None # not in base - ############ - - # initialize these arguments, pass it into super constructor - def __init__(self, **kwargs: Unpack[ImageGPTFastImageProcessorKwargs]): - super().__init__(**kwargs) - # _preprocessor has additional kwargs: - # images, return_tensors, data_format, input_data_format + do_color_quantize = True + clusters = None # Must be set at instantiation - # PUBLIC preprocess: - def preprocess(self, images: ImageInput, **kwargs: Unpack[ImageGPTFastImageProcessorKwargs]) -> BatchFeature: - return super().preprocess(images, **kwargs) + def __init__( + self, + clusters: Optional[Union[list, np.ndarray]] = None, + **kwargs, + ): + super().__init__(**kwargs) + # Store clusters as numpy for JSON serializability. Convert to torch in _preprocess when needed. + if clusters is not None: + self.clusters = np.asarray(clusters, dtype=np.float32) + else: + self.clusters = None + # Default: follow ImageGPT behavior (normalize by default). 
We stash here and force base to skip. + self._do_normalize_imagegpt = kwargs.get("do_normalize", True) + + def _further_process_kwargs(self, **kwargs): + # Let the base process size/crop and other standard kwargs first + kwargs = super()._further_process_kwargs(**kwargs) + if "do_normalize" in kwargs and kwargs["do_normalize"] is not None: + self._do_normalize_imagegpt = kwargs["do_normalize"] + # Force base pipeline to skip its rescale/normalize validation and logic + kwargs["do_rescale"] = False + kwargs["do_normalize"] = False + return kwargs - # PRIVATE preprocess: def _preprocess( self, - images: list["torch.Tensor"], - do_resize: bool, - size: SizeDict, - interpolation: Optional["F.InterpolationMode"], - do_center_crop: bool, - crop_size: SizeDict, - do_rescale: bool, - rescale_factor: float, - do_normalize: bool, - image_mean: Optional[Union[float, list[float]]] = None, - image_std: Optional[Union[float, list[float]]] = None, - disable_grouping: Optional[bool] = False, - return_tensors: Optional[Union[str, TensorType]] = None, - resample: Optional[PILImageResampling] = PILImageResampling.BILINEAR, + images, do_color_quantize: Optional[bool] = None, - clusters: Optional[Union[list[list[int]], np.ndarray]] = None, - data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.FIRST, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs, - ) -> BatchFeature: - do_resize = do_resize if do_resize is not None else self.do_resize - size = size if size is not None else self.size - resample = resample if resample is not None else self.resample - do_normalize = do_normalize if do_normalize is not None else self.do_normalize - do_color_quantize = do_color_quantize if do_color_quantize is not None else self.do_color_quantize - clusters = clusters if clusters is not None else self.clusters - image_mean = image_mean if image_mean is not None else self.image_mean - image_std = image_std if image_std is not None else self.image_std - - # 
1. Setup. Validate ImageGPT-specific requirements - # Check for do_color_quantize and clusters. - if do_color_quantize and clusters is None: - raise ValueError("Clusters must be specified if do_color_quantize is True.") - - # Convert images to torch float32 - images = [image.to(torch.float32) for image in images] - - # Clusters come in np arrays. Convert to torch tensors. - cluster_tensors = None - if clusters is not None: - cluster_tensors = torch.tensor(clusters, dtype=torch.float32) - if len(images) > 0 and images[0].is_cuda: - # if image is stored on a CUDA GPA, convert tensors to CUDA - cluster_tensors = cluster_tensors.cuda() - - # 2. Group images into batches of the same shape for more efficient processing. - grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping) - resized_images_grouped = {} - # Loop through shapes and stacked images - for shape, stacked_images in grouped_images.items(): - # Resize to specific sizes (if do_resize is specified) - if do_resize: - stacked_images = self.resize( - image=stacked_images, - size=size, - interpolation=interpolation - ) - resized_images_grouped[shape] = stacked_images - - # 3. 
Reorder and maintain original image order after processing into batches - resized_images = reorder_images(resized_images_grouped, grouped_images_index) - - # Group images by size for further processing - # Needed in case do_resize is False, or resize returns images with different sizes - grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping) - processed_images_grouped = {} - for shape, stacked_images in grouped_images.items(): - if do_center_crop: - stacked_images = self.center_crop(stacked_images, crop_size) - # Fused rescale - stacked_images = self.rescale( - stacked_images, rescale_factor - ) + clusters: Optional[Union[list, np.ndarray, torch.Tensor]] = None, + return_tensors: Optional[str] = None, + **kwargs + ): + # Run standard fast pipeline (resize, crop, batching) without rescale/normalize + base_batch = super()._preprocess(images, return_tensors=return_tensors, **kwargs) + pixel_values = base_batch["pixel_values"] # Tensor [B,C,H,W] or list of [C,H,W] + + # Apply ImageGPT normalization when requested: [-1, 1] + do_normalize = getattr(self, "_do_normalize_imagegpt", True) + if isinstance(pixel_values, torch.Tensor): + normalized = pixel_values.to(dtype=torch.float32) if do_normalize: - stacked_images = (stacked_images / 127.5) - 1.0 - - processed_images_grouped[shape] = stacked_images - - processed_images = reorder_images(processed_images_grouped, grouped_images_index) - processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images + normalized = normalized / 127.5 - 1.0 + else: + normalized = [img.to(dtype=torch.float32) for img in pixel_values] + if do_normalize: + normalized = [img / 127.5 - 1.0 for img in normalized] - # 4. 
Color quantize if specified + # If color quantization is requested, perform it; otherwise return pixel values + do_color_quantize = do_color_quantize if do_color_quantize is not None else self.do_color_quantize if do_color_quantize: - quantized_images = [] - for image in processed_images: - # Convert CHW to HWC for quantization - image_hwc = image.permute(1, 2, 0) # (H, W, C) - - # Denormalize back to [0, 255] for quantization - image_hwc = (image_hwc + 1.0) * 127.5 - image_hwc = torch.clamp(image_hwc, 0, 255) + # Prepare clusters + clusters = clusters if clusters is not None else self.clusters + if clusters is None: + raise ValueError("Clusters must be provided for color quantization.") + clusters_torch = torch.as_tensor(clusters, dtype=torch.float32) + + # Helper for clarity: quantize a single image [C,H,W] -> [H*W] + def _quantize_one_image(image_chw: torch.Tensor, clusters_ref: torch.Tensor) -> torch.Tensor: + device_clusters = clusters_ref.to(image_chw.device, dtype=image_chw.dtype) + img_hwc = image_chw.permute(1, 2, 0) + pixels = img_hwc.reshape(-1, 3) + return color_quantize_torch(pixels, device_clusters) + + if isinstance(normalized, torch.Tensor): + images_list = [img for img in normalized] + else: + images_list = list(normalized) + + ids_list = [_quantize_one_image(img, clusters_torch) for img in images_list] + + if return_tensors == "pt": + input_ids = torch.stack(ids_list, dim=0) + pixel_values_out = torch.stack(images_list, dim=0) + else: + input_ids = ids_list + pixel_values_out = images_list + + from ...image_processing_utils import BatchFeature + return BatchFeature(data={"input_ids": input_ids, "pixel_values": pixel_values_out}, tensor_type=return_tensors) + + # Otherwise, return pixel values (normalized or not depending on flag) + base_batch["pixel_values"] = normalized + return base_batch + + def to_dict(self): + # Convert numpy arrays to lists for JSON serialization + output = super().to_dict() + if output.get("clusters") is not None and 
isinstance(output["clusters"], np.ndarray): + output["clusters"] = output["clusters"].tolist() + # ImageGPT does not use base mean/std normalization; keep these None for parity with slow processor + # output["image_mean"] = None + # output["image_std"] = None + # No rescaling in fast ImageGPT path + + #Need to set these valus to match with slow processor during testing + output["rescale_factor"] = None + output["do_rescale"] = None + output["do_color_quantize"] = bool(getattr(self, "do_color_quantize", True)) + output.pop("_do_normalize_imagegpt", None) + return output - # Fast torch-based color quantization - quantized = color_quantize_fast(image_hwc, cluster_tensors) - - # Flatten to sequence (H*W,) - quantized_flat = quantized.view(-1) - quantized_images.append(quantized_flat) - - # Stack all quantized sequences - input_ids = torch.stack(quantized_images, dim=0) - - return BatchFeature(data={"input_ids": input_ids}, tensor_type=return_tensors) - else: - # 5. Standard output without quantizing - pixel_values = torch.stack(processed_images, dim=0) - return BatchFeature(data={"pixel_values": pixel_values}, tensor_type=return_tensors) __all__ = ["ImageGPTImageProcessorFast"] From 0b21bff9e878b3e0a5a9a7c0721c26f0aa295c70 Mon Sep 17 00:00:00 2001 From: Ethan Ayaay Date: Mon, 11 Aug 2025 13:31:39 -0700 Subject: [PATCH 19/33] Merged changes from solution2 test file --- .../test_image_processing_imagegpt.py | 51 ++++++++++--------- 1 file changed, 27 insertions(+), 24 deletions(-) diff --git a/tests/models/imagegpt/test_image_processing_imagegpt.py b/tests/models/imagegpt/test_image_processing_imagegpt.py index 56eaf32895e8..12a83b25c1d8 100644 --- a/tests/models/imagegpt/test_image_processing_imagegpt.py +++ b/tests/models/imagegpt/test_image_processing_imagegpt.py @@ -116,43 +116,46 @@ def test_image_processor_properties(self): self.assertTrue(hasattr(image_processing, "do_normalize")) def test_image_processor_from_dict_with_kwargs(self): - image_processor = 
self.image_processing_class.from_dict(self.image_processor_dict) - self.assertEqual(image_processor.size, {"height": 18, "width": 18}) + for image_processing_class in self.image_processor_list: + image_processor = image_processing_class.from_dict(self.image_processor_dict) + self.assertEqual(image_processor.size, {"height": 18, "width": 18}) - image_processor = self.image_processing_class.from_dict(self.image_processor_dict, size=42) - self.assertEqual(image_processor.size, {"height": 42, "width": 42}) + image_processor = image_processing_class.from_dict(self.image_processor_dict, size=42) + self.assertEqual(image_processor.size, {"height": 42, "width": 42}) def test_image_processor_to_json_string(self): - image_processor = self.image_processing_class(**self.image_processor_dict) - obj = json.loads(image_processor.to_json_string()) - for key, value in self.image_processor_dict.items(): - if key == "clusters": - self.assertTrue(np.array_equal(value, obj[key])) - else: - self.assertEqual(obj[key], value) + for image_processing_class in self.image_processor_list: + image_processor = image_processing_class(**self.image_processor_dict) + obj = json.loads(image_processor.to_json_string()) + for key, value in self.image_processor_dict.items(): + if key == "clusters": + self.assertTrue(np.array_equal(value, obj[key])) + else: + self.assertEqual(obj[key], value) def test_image_processor_to_json_file(self): - image_processor_first = self.image_processing_class(**self.image_processor_dict) + for image_processing_class in self.image_processor_list: + image_processor_first = image_processing_class(**self.image_processor_dict) - with tempfile.TemporaryDirectory() as tmpdirname: - json_file_path = os.path.join(tmpdirname, "image_processor.json") - image_processor_first.to_json_file(json_file_path) - image_processor_second = self.image_processing_class.from_json_file(json_file_path).to_dict() + with tempfile.TemporaryDirectory() as tmpdirname: + json_file_path = 
os.path.join(tmpdirname, "image_processor.json") + image_processor_first.to_json_file(json_file_path) + image_processor_second = image_processing_class.from_json_file(json_file_path).to_dict() - image_processor_first = image_processor_first.to_dict() - for key, value in image_processor_first.items(): - if key == "clusters": - self.assertTrue(np.array_equal(value, image_processor_second[key])) - else: - self.assertEqual(image_processor_first[key], value) + image_processor_first = image_processor_first.to_dict() + for key, value in image_processor_first.items(): + if key == "clusters": + self.assertTrue(np.array_equal(value, image_processor_second[key])) + else: + self.assertEqual(image_processor_first[key], value) def test_image_processor_from_and_save_pretrained(self): for image_processing_class in self.image_processor_list: - image_processor_first = self.image_processing_class(**self.image_processor_dict) + image_processor_first = image_processing_class(**self.image_processor_dict) with tempfile.TemporaryDirectory() as tmpdirname: image_processor_first.save_pretrained(tmpdirname) - image_processor_second = self.image_processing_class.from_pretrained(tmpdirname).to_dict() + image_processor_second = image_processing_class.from_pretrained(tmpdirname).to_dict() image_processor_first = image_processor_first.to_dict() for key, value in image_processor_first.items(): From e98d5faab85171af4f49f3082d36fadb165a2733 Mon Sep 17 00:00:00 2001 From: agamjots Date: Mon, 11 Aug 2025 14:05:40 -0700 Subject: [PATCH 20/33] fixed testing through baseImageGPT processor file --- .../imagegpt/image_processing_imagegpt.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/imagegpt/image_processing_imagegpt.py b/src/transformers/models/imagegpt/image_processing_imagegpt.py index 5d3e207dd3eb..a264b948848a 100644 --- a/src/transformers/models/imagegpt/image_processing_imagegpt.py +++ 
b/src/transformers/models/imagegpt/image_processing_imagegpt.py @@ -279,6 +279,12 @@ def preprocess( if do_normalize: images = [self.normalize(image=image, input_data_format=input_data_format) for image in images] + # Need pixel_values (normalized, channels_first) for equivalence tests + pixel_values = [ + to_channel_dimension_format(image, ChannelDimension.FIRST, input_channel_dim=input_data_format) + for image in images + ] + if do_color_quantize: images = [to_channel_dimension_format(image, ChannelDimension.LAST, input_data_format) for image in images] # color quantize from (batch_size, height, width, 3) to (batch_size, height, width) @@ -291,14 +297,21 @@ def preprocess( # We need to convert back to a list of images to keep consistent behaviour across processors. images = list(images) + data = {"input_ids": images, "pixel_values": pixel_values} else: images = [ to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images ] - - data = {"input_ids": images} + data = {"pixel_values": images} return BatchFeature(data=data, tensor_type=return_tensors) + def to_dict(self): + output = super().to_dict() + # Ensure clusters are JSON/equality friendly + if output.get("clusters") is not None and isinstance(output["clusters"], np.ndarray): + output["clusters"] = output["clusters"].tolist() + return output + __all__ = ["ImageGPTImageProcessor"] From f3b0a8c80d6e4534fa9418ad1cfaf19cfdf18525 Mon Sep 17 00:00:00 2001 From: Ethan Ayaay Date: Tue, 12 Aug 2025 11:02:59 -0700 Subject: [PATCH 21/33] Fixed check_code_quality test. Removed unncessary list comprehension. 
--- .../models/imagegpt/image_processing_imagegpt_fast.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py index 8d62789121f7..cff81df60c77 100644 --- a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py +++ b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py @@ -142,10 +142,7 @@ def _quantize_one_image(image_chw: torch.Tensor, clusters_ref: torch.Tensor) -> pixels = img_hwc.reshape(-1, 3) return color_quantize_torch(pixels, device_clusters) - if isinstance(normalized, torch.Tensor): - images_list = [img for img in normalized] - else: - images_list = list(normalized) + images_list = list(normalized) ids_list = [_quantize_one_image(img, clusters_torch) for img in images_list] From 43c61716443056256d67e196d0d66c2cf7aad22b Mon Sep 17 00:00:00 2001 From: Ethan Ayaay Date: Tue, 12 Aug 2025 11:08:30 -0700 Subject: [PATCH 22/33] reorganized imports in image_processing_imagegpt_fast --- .../models/imagegpt/image_processing_imagegpt_fast.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py index cff81df60c77..c54a89892483 100644 --- a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py +++ b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py @@ -14,9 +14,10 @@ # limitations under the License. 
"""Fast Image processor class for ImageGPT.""" +from typing import Optional, Union + import numpy as np import torch -from typing import Optional, Union from ...image_processing_utils_fast import BaseImageProcessorFast from ...image_utils import PILImageResampling From 5bb6d5aed161ec158ca7f27df88efb36fbdb86f9 Mon Sep 17 00:00:00 2001 From: Ethan Ayaay Date: Tue, 12 Aug 2025 12:12:38 -0700 Subject: [PATCH 23/33] formatted image_processing_imagegpt_fast.py --- .../image_processing_imagegpt_fast.py | 38 +++++++++++++------ 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py index c54a89892483..c34f9e150ab2 100644 --- a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py +++ b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py @@ -36,9 +36,9 @@ def squared_euclidean_distance_torch(a: torch.Tensor, b: torch.Tensor) -> torch. 
(N, M) tensor of squared distances """ b = b.t() # (3, M) - a2 = torch.sum(a ** 2, dim=1) # (N,) - b2 = torch.sum(b ** 2, dim=0) # (M,) - ab = torch.matmul(a, b) # (N, M) + a2 = torch.sum(a**2, dim=1) # (N,) + b2 = torch.sum(b**2, dim=0) # (M,) + ab = torch.matmul(a, b) # (N, M) d = a2[:, None] - 2 * ab + b2[None, :] return d @@ -110,10 +110,12 @@ def _preprocess( do_color_quantize: Optional[bool] = None, clusters: Optional[Union[list, np.ndarray, torch.Tensor]] = None, return_tensors: Optional[str] = None, - **kwargs + **kwargs, ): # Run standard fast pipeline (resize, crop, batching) without rescale/normalize - base_batch = super()._preprocess(images, return_tensors=return_tensors, **kwargs) + base_batch = super()._preprocess( + images, return_tensors=return_tensors, **kwargs + ) pixel_values = base_batch["pixel_values"] # Tensor [B,C,H,W] or list of [C,H,W] # Apply ImageGPT normalization when requested: [-1, 1] @@ -128,7 +130,11 @@ def _preprocess( normalized = [img / 127.5 - 1.0 for img in normalized] # If color quantization is requested, perform it; otherwise return pixel values - do_color_quantize = do_color_quantize if do_color_quantize is not None else self.do_color_quantize + do_color_quantize = ( + do_color_quantize + if do_color_quantize is not None + else self.do_color_quantize + ) if do_color_quantize: # Prepare clusters clusters = clusters if clusters is not None else self.clusters @@ -137,8 +143,12 @@ def _preprocess( clusters_torch = torch.as_tensor(clusters, dtype=torch.float32) # Helper for clarity: quantize a single image [C,H,W] -> [H*W] - def _quantize_one_image(image_chw: torch.Tensor, clusters_ref: torch.Tensor) -> torch.Tensor: - device_clusters = clusters_ref.to(image_chw.device, dtype=image_chw.dtype) + def _quantize_one_image( + image_chw: torch.Tensor, clusters_ref: torch.Tensor + ) -> torch.Tensor: + device_clusters = clusters_ref.to( + image_chw.device, dtype=image_chw.dtype + ) img_hwc = image_chw.permute(1, 2, 0) pixels = 
img_hwc.reshape(-1, 3) return color_quantize_torch(pixels, device_clusters) @@ -155,7 +165,11 @@ def _quantize_one_image(image_chw: torch.Tensor, clusters_ref: torch.Tensor) -> pixel_values_out = images_list from ...image_processing_utils import BatchFeature - return BatchFeature(data={"input_ids": input_ids, "pixel_values": pixel_values_out}, tensor_type=return_tensors) + + return BatchFeature( + data={"input_ids": input_ids, "pixel_values": pixel_values_out}, + tensor_type=return_tensors, + ) # Otherwise, return pixel values (normalized or not depending on flag) base_batch["pixel_values"] = normalized @@ -164,14 +178,16 @@ def _quantize_one_image(image_chw: torch.Tensor, clusters_ref: torch.Tensor) -> def to_dict(self): # Convert numpy arrays to lists for JSON serialization output = super().to_dict() - if output.get("clusters") is not None and isinstance(output["clusters"], np.ndarray): + if output.get("clusters") is not None and isinstance( + output["clusters"], np.ndarray + ): output["clusters"] = output["clusters"].tolist() # ImageGPT does not use base mean/std normalization; keep these None for parity with slow processor # output["image_mean"] = None # output["image_std"] = None # No rescaling in fast ImageGPT path - #Need to set these valus to match with slow processor during testing + # Need to set these valus to match with slow processor during testing output["rescale_factor"] = None output["do_rescale"] = None output["do_color_quantize"] = bool(getattr(self, "do_color_quantize", True)) From e575ba74bd625856ca8c22e62ce246250189e373 Mon Sep 17 00:00:00 2001 From: chris Date: Tue, 12 Aug 2025 12:25:43 -0700 Subject: [PATCH 24/33] Added arg documentation --- .../imagegpt/image_processing_imagegpt_fast.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py index c34f9e150ab2..add2527cf1fc 100644 --- 
a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py +++ b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py @@ -112,6 +112,20 @@ def _preprocess( return_tensors: Optional[str] = None, **kwargs, ): + """ + Preprocess an image or batch of images. + + Args: + clusters (`np.ndarray`, `list[list[float]]`, or `torch.Tensor`, *optional*, defaults to `self.clusters`): + Clusters used to quantize the image of shape `(n_clusters, 3)`. Only has an effect if + `do_color_quantize` is set to `True`. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `torch.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + """ # Run standard fast pipeline (resize, crop, batching) without rescale/normalize base_batch = super()._preprocess( images, return_tensors=return_tensors, **kwargs From 25bd8ac477be3362533ba5bd2986d83ab2c7a921 Mon Sep 17 00:00:00 2001 From: chris Date: Tue, 12 Aug 2025 13:36:25 -0700 Subject: [PATCH 25/33] Added FastImageProcessorKwargs class + Docs for new kwargs --- .../image_processing_imagegpt_fast.py | 59 +++++++++++++++++-- 1 file changed, 54 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py index add2527cf1fc..1f907366f055 100644 --- a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py +++ b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py @@ -17,12 +17,25 @@ from typing import Optional, Union import numpy as np -import torch - -from ...image_processing_utils_fast import BaseImageProcessorFast -from ...image_utils import PILImageResampling -from ...utils import auto_docstring +from ...image_processing_utils_fast 
import BaseImageProcessorFast, DefaultFastImageProcessorKwargs +from ...image_utils import PILImageResampling, ChannelDimension +from ...utils import ( + TensorType, + auto_docstring, + is_torch_available, + is_torchvision_available, + is_torchvision_v2_available +) + +if is_torch_available(): + import torch + +if is_torchvision_available(): + if is_torchvision_v2_available(): + from torchvision.transforms.v2 import functional as F + else: + from torchvision.transforms import functional as F def squared_euclidean_distance_torch(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor: """ @@ -57,6 +70,42 @@ def color_quantize_torch(x: torch.Tensor, clusters: torch.Tensor) -> torch.Tenso d = squared_euclidean_distance_torch(x, clusters) return torch.argmin(d, dim=1) +class ImageGPTFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): + """ + clusters (`np.ndarray` or `list[list[int]]`, *optional*): + The color clusters to use, of shape `(n_clusters, 3)` when color quantizing. Can be overridden by `clusters` + in `preprocess`. + resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`): + Resampling filter to use if resizing the image. Can be overridden by `resample` in `preprocess`. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `torch.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the output image. If unset, the channel dimension format of the input + image is used. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. 
+ - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + do_color_quantize (`bool`, *optional*, defaults to `self.do_color_quantize`): + Whether to color quantize the image. + """ + + clusters: Optional[np.ndarray] = None + resample: Optional[PILImageResampling] = PILImageResampling.BILINEAR + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None + do_color_quantize: Optional[bool] = True + @auto_docstring class ImageGPTImageProcessorFast(BaseImageProcessorFast): From ff89353976336eb5d3c67a8650c7417a181d2aa1 Mon Sep 17 00:00:00 2001 From: chris Date: Tue, 12 Aug 2025 13:39:20 -0700 Subject: [PATCH 26/33] Reformatted previous --- .../image_processing_imagegpt_fast.py | 35 ++++++++----------- 1 file changed, 14 insertions(+), 21 deletions(-) diff --git a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py index 1f907366f055..7059a69496f1 100644 --- a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py +++ b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py @@ -18,14 +18,17 @@ import numpy as np -from ...image_processing_utils_fast import BaseImageProcessorFast, DefaultFastImageProcessorKwargs +from ...image_processing_utils_fast import ( + BaseImageProcessorFast, + DefaultFastImageProcessorKwargs, +) from ...image_utils 
import PILImageResampling, ChannelDimension from ...utils import ( TensorType, auto_docstring, is_torch_available, is_torchvision_available, - is_torchvision_v2_available + is_torchvision_v2_available, ) if is_torch_available(): @@ -37,6 +40,7 @@ else: from torchvision.transforms import functional as F + def squared_euclidean_distance_torch(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor: """ Compute squared Euclidean distances between all pixels and clusters. @@ -70,6 +74,7 @@ def color_quantize_torch(x: torch.Tensor, clusters: torch.Tensor) -> torch.Tenso d = squared_euclidean_distance_torch(x, clusters) return torch.argmin(d, dim=1) + class ImageGPTFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): """ clusters (`np.ndarray` or `list[list[int]]`, *optional*): @@ -101,8 +106,8 @@ class ImageGPTFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): clusters: Optional[np.ndarray] = None resample: Optional[PILImageResampling] = PILImageResampling.BILINEAR - return_tensors: Optional[Union[str, TensorType]] = None, - data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.FIRST, + return_tensors: Optional[Union[str, TensorType]] = (None,) + data_format: Optional[Union[str, ChannelDimension]] = (ChannelDimension.FIRST,) input_data_format: Optional[Union[str, ChannelDimension]] = None do_color_quantize: Optional[bool] = True @@ -176,9 +181,7 @@ def _preprocess( - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. 
""" # Run standard fast pipeline (resize, crop, batching) without rescale/normalize - base_batch = super()._preprocess( - images, return_tensors=return_tensors, **kwargs - ) + base_batch = super()._preprocess(images, return_tensors=return_tensors, **kwargs) pixel_values = base_batch["pixel_values"] # Tensor [B,C,H,W] or list of [C,H,W] # Apply ImageGPT normalization when requested: [-1, 1] @@ -193,11 +196,7 @@ def _preprocess( normalized = [img / 127.5 - 1.0 for img in normalized] # If color quantization is requested, perform it; otherwise return pixel values - do_color_quantize = ( - do_color_quantize - if do_color_quantize is not None - else self.do_color_quantize - ) + do_color_quantize = do_color_quantize if do_color_quantize is not None else self.do_color_quantize if do_color_quantize: # Prepare clusters clusters = clusters if clusters is not None else self.clusters @@ -206,12 +205,8 @@ def _preprocess( clusters_torch = torch.as_tensor(clusters, dtype=torch.float32) # Helper for clarity: quantize a single image [C,H,W] -> [H*W] - def _quantize_one_image( - image_chw: torch.Tensor, clusters_ref: torch.Tensor - ) -> torch.Tensor: - device_clusters = clusters_ref.to( - image_chw.device, dtype=image_chw.dtype - ) + def _quantize_one_image(image_chw: torch.Tensor, clusters_ref: torch.Tensor) -> torch.Tensor: + device_clusters = clusters_ref.to(image_chw.device, dtype=image_chw.dtype) img_hwc = image_chw.permute(1, 2, 0) pixels = img_hwc.reshape(-1, 3) return color_quantize_torch(pixels, device_clusters) @@ -241,9 +236,7 @@ def _quantize_one_image( def to_dict(self): # Convert numpy arrays to lists for JSON serialization output = super().to_dict() - if output.get("clusters") is not None and isinstance( - output["clusters"], np.ndarray - ): + if output.get("clusters") is not None and isinstance(output["clusters"], np.ndarray): output["clusters"] = output["clusters"].tolist() # ImageGPT does not use base mean/std normalization; keep these None for parity with slow 
processor # output["image_mean"] = None From a1b2e7f6fc4b79c2d55bfe9cfea2f90590614762 Mon Sep 17 00:00:00 2001 From: chris Date: Tue, 12 Aug 2025 14:42:09 -0700 Subject: [PATCH 27/33] Added F to normalization --- .../models/imagegpt/image_processing_imagegpt_fast.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py index 7059a69496f1..4e5f05a8f28e 100644 --- a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py +++ b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py @@ -23,6 +23,7 @@ DefaultFastImageProcessorKwargs, ) from ...image_utils import PILImageResampling, ChannelDimension +from ...processing_utils import Unpack from ...utils import ( TensorType, auto_docstring, @@ -137,7 +138,7 @@ class ImageGPTImageProcessorFast(BaseImageProcessorFast): def __init__( self, clusters: Optional[Union[list, np.ndarray]] = None, - **kwargs, + **kwargs: Unpack[ImageGPTFastImageProcessorKwargs], ): super().__init__(**kwargs) # Store clusters as numpy for JSON serializability. Convert to torch in _preprocess when needed. @@ -164,7 +165,7 @@ def _preprocess( do_color_quantize: Optional[bool] = None, clusters: Optional[Union[list, np.ndarray, torch.Tensor]] = None, return_tensors: Optional[str] = None, - **kwargs, + **kwargs: Unpack[ImageGPTFastImageProcessorKwargs], ): """ Preprocess an image or batch of images. 
@@ -189,11 +190,11 @@ def _preprocess( if isinstance(pixel_values, torch.Tensor): normalized = pixel_values.to(dtype=torch.float32) if do_normalize: - normalized = normalized / 127.5 - 1.0 + normalized = F.normalize(normalized, mean=[0.0], std=[127.5]) - 1.0 else: normalized = [img.to(dtype=torch.float32) for img in pixel_values] if do_normalize: - normalized = [img / 127.5 - 1.0 for img in normalized] + normalized = [F.normalize(img, mean=[0.0], std=[127.5]) - 1.0 for img in normalized] # If color quantization is requested, perform it; otherwise return pixel values do_color_quantize = do_color_quantize if do_color_quantize is not None else self.do_color_quantize From cd4d0637eb8a50e1373205c524784f38d9f83095 Mon Sep 17 00:00:00 2001 From: agamjots Date: Tue, 12 Aug 2025 18:06:09 -0700 Subject: [PATCH 28/33] fixed ruff linting and cleaned up fast processor file --- .../imagegpt/image_processing_imagegpt_fast.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py index 4e5f05a8f28e..b65ac4997e08 100644 --- a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py +++ b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py @@ -22,7 +22,7 @@ BaseImageProcessorFast, DefaultFastImageProcessorKwargs, ) -from ...image_utils import PILImageResampling, ChannelDimension +from ...image_utils import ChannelDimension, PILImageResampling from ...processing_utils import Unpack from ...utils import ( TensorType, @@ -32,6 +32,7 @@ is_torchvision_v2_available, ) + if is_torch_available(): import torch @@ -128,12 +129,12 @@ class ImageGPTImageProcessorFast(BaseImageProcessorFast): resample = PILImageResampling.BILINEAR size = {"height": 256, "width": 256} do_resize = True - # We do NOT use the base normalization/rescale as ImageGPT expects (x/127.5 - 1) + # We do NOT want touse the base 
normalization/rescale as ImageGPT expects (x/127.5 - 1) do_rescale = False do_normalize = False do_color_quantize = True - clusters = None # Must be set at instantiation + clusters = None def __init__( self, @@ -183,7 +184,7 @@ def _preprocess( """ # Run standard fast pipeline (resize, crop, batching) without rescale/normalize base_batch = super()._preprocess(images, return_tensors=return_tensors, **kwargs) - pixel_values = base_batch["pixel_values"] # Tensor [B,C,H,W] or list of [C,H,W] + pixel_values = base_batch["pixel_values"] # Apply ImageGPT normalization when requested: [-1, 1] do_normalize = getattr(self, "_do_normalize_imagegpt", True) @@ -205,7 +206,6 @@ def _preprocess( raise ValueError("Clusters must be provided for color quantization.") clusters_torch = torch.as_tensor(clusters, dtype=torch.float32) - # Helper for clarity: quantize a single image [C,H,W] -> [H*W] def _quantize_one_image(image_chw: torch.Tensor, clusters_ref: torch.Tensor) -> torch.Tensor: device_clusters = clusters_ref.to(image_chw.device, dtype=image_chw.dtype) img_hwc = image_chw.permute(1, 2, 0) @@ -239,10 +239,6 @@ def to_dict(self): output = super().to_dict() if output.get("clusters") is not None and isinstance(output["clusters"], np.ndarray): output["clusters"] = output["clusters"].tolist() - # ImageGPT does not use base mean/std normalization; keep these None for parity with slow processor - # output["image_mean"] = None - # output["image_std"] = None - # No rescaling in fast ImageGPT path # Need to set these valus to match with slow processor during testing output["rescale_factor"] = None From 1315a98c3307475658d5facb471ff1b35c83b75f Mon Sep 17 00:00:00 2001 From: agamjots Date: Thu, 21 Aug 2025 22:40:03 -0700 Subject: [PATCH 29/33] implemented requested changes --- .../imagegpt/image_processing_imagegpt.py | 13 +- .../image_processing_imagegpt_fast.py | 259 ++++++++++-------- .../test_image_processing_imagegpt.py | 53 ++++ 3 files changed, 202 insertions(+), 123 deletions(-) 
diff --git a/src/transformers/models/imagegpt/image_processing_imagegpt.py b/src/transformers/models/imagegpt/image_processing_imagegpt.py index a264b948848a..c29e436f17fa 100644 --- a/src/transformers/models/imagegpt/image_processing_imagegpt.py +++ b/src/transformers/models/imagegpt/image_processing_imagegpt.py @@ -297,10 +297,10 @@ def preprocess( # We need to convert back to a list of images to keep consistent behaviour across processors. images = list(images) - data = {"input_ids": images, "pixel_values": pixel_values} + data = {"input_ids": images} else: images = [ - to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) + to_channel_dimension_format(image, data_format, input_data_format) for image in images ] data = {"pixel_values": images} @@ -311,6 +311,15 @@ def to_dict(self): # Ensure clusters are JSON/equality friendly if output.get("clusters") is not None and isinstance(output["clusters"], np.ndarray): output["clusters"] = output["clusters"].tolist() + # Need to set missing keys from slow processor to match the expected behavior in save/load tests compared to fast processor + missing_keys = [ + "image_mean", "image_std", + "rescale_factor", "do_rescale" + ] + for key in missing_keys: + if key in output: + output[key] = None + return output diff --git a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py index b65ac4997e08..ffe5f915be38 100644 --- a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py +++ b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py @@ -18,11 +18,13 @@ import numpy as np +from ...image_processing_utils import BatchFeature from ...image_processing_utils_fast import ( BaseImageProcessorFast, DefaultFastImageProcessorKwargs, ) -from ...image_utils import ChannelDimension, PILImageResampling +from ...image_transforms import group_images_by_shape, reorder_images +from ...image_utils import 
PILImageResampling from ...processing_utils import Unpack from ...utils import ( TensorType, @@ -58,8 +60,8 @@ def squared_euclidean_distance_torch(a: torch.Tensor, b: torch.Tensor) -> torch. a2 = torch.sum(a**2, dim=1) # (N,) b2 = torch.sum(b**2, dim=0) # (M,) ab = torch.matmul(a, b) # (N, M) - d = a2[:, None] - 2 * ab + b2[None, :] - return d + d = a2[:, None] - 2 * ab + b2[None, :] # Squared Euclidean Distance: a^2 - 2ab + b^2 + return d # (N, M) tensor of squared distances def color_quantize_torch(x: torch.Tensor, clusters: torch.Tensor) -> torch.Tensor: @@ -84,118 +86,80 @@ class ImageGPTFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): in `preprocess`. resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`): Resampling filter to use if resizing the image. Can be overridden by `resample` in `preprocess`. - return_tensors (`str` or `TensorType`, *optional*): - The type of tensors to return. Can be one of: - - Unset: Return a list of `torch.Tensor`. - - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. - - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. - - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. - data_format (`ChannelDimension` or `str`, *optional*): - The channel dimension format for the output image. If unset, the channel dimension format of the input - image is used. Can be one of: - - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. - input_data_format (`ChannelDimension` or `str`, *optional*): - The channel dimension format for the input image. If unset, the channel dimension format is inferred - from the input image. Can be one of: - - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. 
- - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. - do_color_quantize (`bool`, *optional*, defaults to `self.do_color_quantize`): - Whether to color quantize the image. """ clusters: Optional[np.ndarray] = None resample: Optional[PILImageResampling] = PILImageResampling.BILINEAR - return_tensors: Optional[Union[str, TensorType]] = (None,) - data_format: Optional[Union[str, ChannelDimension]] = (ChannelDimension.FIRST,) - input_data_format: Optional[Union[str, ChannelDimension]] = None do_color_quantize: Optional[bool] = True @auto_docstring class ImageGPTImageProcessorFast(BaseImageProcessorFast): - """ - Constructs a fast ImageGPT image processor. - - This processor can be used to resize images to a smaller resolution (such as 32x32 or 64x64), - normalize them and finally color quantize them to obtain sequences of "pixel values" (color clusters). - """ - model_input_names = ["input_ids"] - - # Defaults largely aligned with the slow processor, except normalization which we do manually to [-1, 1] resample = PILImageResampling.BILINEAR - size = {"height": 256, "width": 256} - do_resize = True - # We do NOT want touse the base normalization/rescale as ImageGPT expects (x/127.5 - 1) - do_rescale = False - do_normalize = False - do_color_quantize = True clusters = None + # Use standard normalization with image_mean=[0.5, 0.5, 0.5] and image_std=[0.5, 0.5, 0.5] + # This is equivalent to ImageGPT's (x/127.5 - 1) normalization + image_mean = [0.5, 0.5, 0.5] + image_std = [0.5, 0.5, 0.5] + do_rescale = True + do_normalize = True + + # We are keeping this for backwards compatibility def __init__( self, clusters: Optional[Union[list, np.ndarray]] = None, **kwargs: Unpack[ImageGPTFastImageProcessorKwargs], ): super().__init__(**kwargs) - # Store clusters as numpy for JSON serializability. Convert to torch in _preprocess when needed. 
- if clusters is not None: - self.clusters = np.asarray(clusters, dtype=np.float32) - else: - self.clusters = None - # Default: follow ImageGPT behavior (normalize by default). We stash here and force base to skip. - self._do_normalize_imagegpt = kwargs.get("do_normalize", True) - - def _further_process_kwargs(self, **kwargs): - # Let the base process size/crop and other standard kwargs first - kwargs = super()._further_process_kwargs(**kwargs) - if "do_normalize" in kwargs and kwargs["do_normalize"] is not None: - self._do_normalize_imagegpt = kwargs["do_normalize"] - # Force base pipeline to skip its rescale/normalize validation and logic - kwargs["do_rescale"] = False - kwargs["do_normalize"] = False - return kwargs - + # Store clusters as torch tensor directly for efficiency + self.clusters = torch.tensor(clusters, dtype=torch.float32) if clusters is not None else None def _preprocess( self, - images, + images: list["torch.Tensor"], + do_resize: bool, + size: dict[str, int], + interpolation: Optional["F.InterpolationMode"], + do_center_crop: bool, + crop_size: dict[str, int], + do_rescale: bool, + rescale_factor: float, + do_normalize: bool, + image_mean: Optional[Union[float, list[float]]], + image_std: Optional[Union[float, list[float]]], do_color_quantize: Optional[bool] = None, clusters: Optional[Union[list, np.ndarray, torch.Tensor]] = None, - return_tensors: Optional[str] = None, - **kwargs: Unpack[ImageGPTFastImageProcessorKwargs], + disable_grouping: Optional[bool] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + **kwargs, ): - """ - Preprocess an image or batch of images. - - Args: - clusters (`np.ndarray`, `list[list[float]]`, or `torch.Tensor`, *optional*, defaults to `self.clusters`): - Clusters used to quantize the image of shape `(n_clusters, 3)`. Only has an effect if - `do_color_quantize` is set to `True`. - return_tensors (`str` or `TensorType`, *optional*): - The type of tensors to return. 
Can be one of: - - Unset: Return a list of `torch.Tensor`. - - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. - - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. - - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. - """ - # Run standard fast pipeline (resize, crop, batching) without rescale/normalize - base_batch = super()._preprocess(images, return_tensors=return_tensors, **kwargs) - pixel_values = base_batch["pixel_values"] - - # Apply ImageGPT normalization when requested: [-1, 1] - do_normalize = getattr(self, "_do_normalize_imagegpt", True) - if isinstance(pixel_values, torch.Tensor): - normalized = pixel_values.to(dtype=torch.float32) - if do_normalize: - normalized = F.normalize(normalized, mean=[0.0], std=[127.5]) - 1.0 - else: - normalized = [img.to(dtype=torch.float32) for img in pixel_values] - if do_normalize: - normalized = [F.normalize(img, mean=[0.0], std=[127.5]) - 1.0 for img in normalized] + # Unrolled standard image processing pipeline for clarity + + # Group images by size for batched resizing + grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping) + resized_images_grouped = {} + for shape, stacked_images in grouped_images.items(): + if do_resize: + stacked_images = self.resize(image=stacked_images, size=size, interpolation=interpolation) + resized_images_grouped[shape] = stacked_images + resized_images = reorder_images(resized_images_grouped, grouped_images_index) + + # Group images by size for further processing + # Needed in case do_resize is False, or resize returns images with different sizes + grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping) + processed_images_grouped = {} + for shape, stacked_images in grouped_images.items(): + if do_center_crop: + stacked_images = self.center_crop(stacked_images, crop_size) + # Fused rescale and normalize + stacked_images = 
self.rescale_and_normalize( + stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std + ) + processed_images_grouped[shape] = stacked_images + + pixel_values = reorder_images(processed_images_grouped, grouped_images_index) # If color quantization is requested, perform it; otherwise return pixel values do_color_quantize = do_color_quantize if do_color_quantize is not None else self.do_color_quantize @@ -204,47 +168,100 @@ def _preprocess( clusters = clusters if clusters is not None else self.clusters if clusters is None: raise ValueError("Clusters must be provided for color quantization.") - clusters_torch = torch.as_tensor(clusters, dtype=torch.float32) - - def _quantize_one_image(image_chw: torch.Tensor, clusters_ref: torch.Tensor) -> torch.Tensor: - device_clusters = clusters_ref.to(image_chw.device, dtype=image_chw.dtype) - img_hwc = image_chw.permute(1, 2, 0) - pixels = img_hwc.reshape(-1, 3) - return color_quantize_torch(pixels, device_clusters) - - images_list = list(normalized) - - ids_list = [_quantize_one_image(img, clusters_torch) for img in images_list] - - if return_tensors == "pt": - input_ids = torch.stack(ids_list, dim=0) - pixel_values_out = torch.stack(images_list, dim=0) + # Convert to torch tensor if needed (clusters might be passed as list/numpy) + clusters_torch = torch.as_tensor(clusters, dtype=torch.float32) if not isinstance(clusters, torch.Tensor) else clusters + + # Group images by shape for batch processing + # We need to check if the pixel values are a tensor or a list of tensors + if isinstance(pixel_values, torch.Tensor): + # Single batch case + images_list = [pixel_values] else: - input_ids = ids_list - pixel_values_out = images_list - - from ...image_processing_utils import BatchFeature - + # Multiple images case, we group by shape + shape_groups = {} + for i, img in enumerate(pixel_values): + shape = img.shape + if shape not in shape_groups: + shape_groups[shape] = [] + shape_groups[shape].append((i, img)) + 
+ images_list = [] + for shape, group in shape_groups.items(): + if len(group) > 1: + # Batch process images of same shape + batch_imgs = torch.stack([img for _, img in group]) + images_list.append((batch_imgs, [idx for idx, _ in group])) + else: + # Single image + idx, img = group[0] + images_list.append((img.unsqueeze(0), [idx])) + + # Process each group + all_input_ids = [None] * len(pixel_values) if not isinstance(pixel_values, torch.Tensor) else None + + for group_data in images_list: + if isinstance(pixel_values, torch.Tensor): + # Single batch case + batch_imgs = group_data + batch_size = batch_imgs.shape[0] + # Convert from CHW to HWC and flatten + batch_hwc = batch_imgs.permute(0, 2, 3, 1) # (B, H, W, C) + batch_flat = batch_hwc.reshape(batch_size, -1, 3) # (B, H*W, C) + + # Quantize each image in the batch + device_clusters = clusters_torch.to(batch_flat.device, dtype=batch_flat.dtype) + input_ids = color_quantize_torch(batch_flat.reshape(-1, 3), device_clusters) + input_ids = input_ids.reshape(batch_size, -1) # (B, H*W) + + return BatchFeature( + data={"input_ids": input_ids}, + tensor_type=return_tensors, + ) + else: + # Multiple images case + batch_imgs, indices = group_data + if batch_imgs.shape[0] == 1: + # Single image + img = batch_imgs.squeeze(0) + img_hwc = img.permute(1, 2, 0) # (H, W, C) + pixels = img_hwc.reshape(-1, 3) # (H*W, C) + + device_clusters = clusters_torch.to(pixels.device, dtype=pixels.dtype) + input_ids = color_quantize_torch(pixels, device_clusters) + + all_input_ids[indices[0]] = input_ids + else: + # Batch of same shape + batch_hwc = batch_imgs.permute(0, 2, 3, 1) # (B, H, W, C) + batch_flat = batch_hwc.reshape(batch_imgs.shape[0], -1, 3) # (B, H*W, C) + + device_clusters = clusters_torch.to(batch_flat.device, dtype=batch_flat.dtype) + input_ids = color_quantize_torch(batch_flat.reshape(-1, 3), device_clusters) + input_ids = input_ids.reshape(batch_imgs.shape[0], -1) # (B, H*W) + + for i, idx in enumerate(indices): + 
all_input_ids[idx] = input_ids[i] + + # Stack input_ids if returning tensors + if return_tensors: + all_input_ids = torch.stack(all_input_ids, dim=0) return BatchFeature( - data={"input_ids": input_ids, "pixel_values": pixel_values_out}, + data={"input_ids": all_input_ids}, tensor_type=return_tensors, ) - # Otherwise, return pixel values (normalized or not depending on flag) - base_batch["pixel_values"] = normalized - return base_batch + # Otherwise, return normalized pixel values + pixel_values = torch.stack(pixel_values, dim=0) if return_tensors else pixel_values + return BatchFeature(data={"pixel_values": pixel_values}, tensor_type=return_tensors) def to_dict(self): - # Convert numpy arrays to lists for JSON serialization + # Convert torch tensors to lists for JSON serialization output = super().to_dict() - if output.get("clusters") is not None and isinstance(output["clusters"], np.ndarray): + if output.get("clusters") is not None and isinstance(output["clusters"], torch.Tensor): output["clusters"] = output["clusters"].tolist() - # Need to set these valus to match with slow processor during testing - output["rescale_factor"] = None - output["do_rescale"] = None - output["do_color_quantize"] = bool(getattr(self, "do_color_quantize", True)) - output.pop("_do_normalize_imagegpt", None) + # Ensure we match the slow processor's configuration + output["do_color_quantize"] = True return output diff --git a/tests/models/imagegpt/test_image_processing_imagegpt.py b/tests/models/imagegpt/test_image_processing_imagegpt.py index 12a83b25c1d8..21ec7e7f0a8b 100644 --- a/tests/models/imagegpt/test_image_processing_imagegpt.py +++ b/tests/models/imagegpt/test_image_processing_imagegpt.py @@ -256,6 +256,59 @@ def test_call_pytorch(self): (self.image_processor_tester.batch_size, *expected_output_image_shape), ) + def test_slow_fast_equivalence(self): + if self.fast_image_processing_class is None: + self.skipTest("Fast image processing class not available") + + for 
image_processing_class in self.image_processor_list: + # Initialize image_processing + image_processing = image_processing_class(**self.image_processor_dict) + # create random numpy tensors + image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True) + for image in image_inputs: + self.assertIsInstance(image, np.ndarray) + + # Test not batched input + encoding_slow = self.image_processing_class(**self.image_processor_dict)(image_inputs[0], return_tensors="pt") + encoding_fast = self.fast_image_processing_class(**self.image_processor_dict)(image_inputs[0], return_tensors="pt") + # Convert to float for mean calculation since input_ids are integers + slow_tensor = encoding_slow.input_ids.float() + fast_tensor = encoding_fast.input_ids.float() + # For quantization-based processors, use absolute tolerance only to avoid infinity issues + # when one value is 0 and the other is 1. The rtol=0 prevents relative tolerance calculation. + self._assert_slow_fast_tensors_equivalence(slow_tensor, fast_tensor, atol=1.0, rtol=0) + + # Test batched + encoding_slow = self.image_processing_class(**self.image_processor_dict)(image_inputs, return_tensors="pt") + encoding_fast = self.fast_image_processing_class(**self.image_processor_dict)(image_inputs, return_tensors="pt") + # Convert to float for mean calculation since input_ids are integers + slow_tensor = encoding_slow.input_ids.float() + fast_tensor = encoding_fast.input_ids.float() + # Once again using absolute tolerance only to avoid infinity issues + self._assert_slow_fast_tensors_equivalence(slow_tensor, fast_tensor, atol=1.0, rtol=0) + + def test_slow_fast_equivalence_batched(self): + if self.fast_image_processing_class is None: + self.skipTest("Fast image processing class not available") + + for image_processing_class in self.image_processor_list: + # Initialize image_processing + image_processing = image_processing_class(**self.image_processor_dict) + # create random numpy tensors + 
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True) + for image in image_inputs: + self.assertIsInstance(image, np.ndarray) + + # Test batched + encoding_slow = self.image_processing_class(**self.image_processor_dict)(image_inputs, return_tensors="pt") + encoding_fast = self.fast_image_processing_class(**self.image_processor_dict)(image_inputs, return_tensors="pt") + # Convert to float for mean calculation since input_ids are integers + slow_tensor = encoding_slow.input_ids.float() + fast_tensor = encoding_fast.input_ids.float() + # For quantization-based processors, use absolute tolerance only to avoid infinity issues + # when one value is 0 and the other is 1. The rtol=0 prevents relative tolerance calculation. + self._assert_slow_fast_tensors_equivalence(slow_tensor, fast_tensor, atol=1.0, rtol=0) + def prepare_images(): # we use revision="refs/pr/1" until the PR is merged From 0f34fd1b10462c001230ffba75045d22b8fc2326 Mon Sep 17 00:00:00 2001 From: agamjots Date: Thu, 21 Aug 2025 23:25:39 -0700 Subject: [PATCH 30/33] fixed ruff checks --- .../models/imagegpt/image_processing_imagegpt.py | 7 +------ tests/models/imagegpt/test_image_processing_imagegpt.py | 4 ---- 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/src/transformers/models/imagegpt/image_processing_imagegpt.py b/src/transformers/models/imagegpt/image_processing_imagegpt.py index c29e436f17fa..f479026a4527 100644 --- a/src/transformers/models/imagegpt/image_processing_imagegpt.py +++ b/src/transformers/models/imagegpt/image_processing_imagegpt.py @@ -279,11 +279,6 @@ def preprocess( if do_normalize: images = [self.normalize(image=image, input_data_format=input_data_format) for image in images] - # Need pixel_values (normalized, channels_first) for equivalence tests - pixel_values = [ - to_channel_dimension_format(image, ChannelDimension.FIRST, input_channel_dim=input_data_format) - for image in images - ] if do_color_quantize: images = 
[to_channel_dimension_format(image, ChannelDimension.LAST, input_data_format) for image in images] @@ -319,7 +314,7 @@ def to_dict(self): for key in missing_keys: if key in output: output[key] = None - + return output diff --git a/tests/models/imagegpt/test_image_processing_imagegpt.py b/tests/models/imagegpt/test_image_processing_imagegpt.py index 21ec7e7f0a8b..998bb0d665bb 100644 --- a/tests/models/imagegpt/test_image_processing_imagegpt.py +++ b/tests/models/imagegpt/test_image_processing_imagegpt.py @@ -261,8 +261,6 @@ def test_slow_fast_equivalence(self): self.skipTest("Fast image processing class not available") for image_processing_class in self.image_processor_list: - # Initialize image_processing - image_processing = image_processing_class(**self.image_processor_dict) # create random numpy tensors image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True) for image in image_inputs: @@ -292,8 +290,6 @@ def test_slow_fast_equivalence_batched(self): self.skipTest("Fast image processing class not available") for image_processing_class in self.image_processor_list: - # Initialize image_processing - image_processing = image_processing_class(**self.image_processor_dict) # create random numpy tensors image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True) for image in image_inputs: From 4f34393539273c96c6202540e0ea564371880d2a Mon Sep 17 00:00:00 2001 From: agamjots Date: Thu, 21 Aug 2025 23:30:10 -0700 Subject: [PATCH 31/33] fixed formatting issues --- .../models/imagegpt/image_processing_imagegpt.py | 11 ++--------- .../imagegpt/image_processing_imagegpt_fast.py | 7 +++++-- .../imagegpt/test_image_processing_imagegpt.py | 16 ++++++++++++---- 3 files changed, 19 insertions(+), 15 deletions(-) diff --git a/src/transformers/models/imagegpt/image_processing_imagegpt.py b/src/transformers/models/imagegpt/image_processing_imagegpt.py index f479026a4527..1f2026627515 100644 --- 
a/src/transformers/models/imagegpt/image_processing_imagegpt.py +++ b/src/transformers/models/imagegpt/image_processing_imagegpt.py @@ -279,7 +279,6 @@ def preprocess( if do_normalize: images = [self.normalize(image=image, input_data_format=input_data_format) for image in images] - if do_color_quantize: images = [to_channel_dimension_format(image, ChannelDimension.LAST, input_data_format) for image in images] # color quantize from (batch_size, height, width, 3) to (batch_size, height, width) @@ -294,10 +293,7 @@ def preprocess( images = list(images) data = {"input_ids": images} else: - images = [ - to_channel_dimension_format(image, data_format, input_data_format) - for image in images - ] + images = [to_channel_dimension_format(image, data_format, input_data_format) for image in images] data = {"pixel_values": images} return BatchFeature(data=data, tensor_type=return_tensors) @@ -307,10 +303,7 @@ def to_dict(self): if output.get("clusters") is not None and isinstance(output["clusters"], np.ndarray): output["clusters"] = output["clusters"].tolist() # Need to set missing keys from slow processor to match the expected behavior in save/load tests compared to fast processor - missing_keys = [ - "image_mean", "image_std", - "rescale_factor", "do_rescale" - ] + missing_keys = ["image_mean", "image_std", "rescale_factor", "do_rescale"] for key in missing_keys: if key in output: output[key] = None diff --git a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py index ffe5f915be38..53d32ffd1e70 100644 --- a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py +++ b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py @@ -61,7 +61,7 @@ def squared_euclidean_distance_torch(a: torch.Tensor, b: torch.Tensor) -> torch. 
b2 = torch.sum(b**2, dim=0) # (M,) ab = torch.matmul(a, b) # (N, M) d = a2[:, None] - 2 * ab + b2[None, :] # Squared Euclidean Distance: a^2 - 2ab + b^2 - return d # (N, M) tensor of squared distances + return d # (N, M) tensor of squared distances def color_quantize_torch(x: torch.Tensor, clusters: torch.Tensor) -> torch.Tensor: @@ -116,6 +116,7 @@ def __init__( super().__init__(**kwargs) # Store clusters as torch tensor directly for efficiency self.clusters = torch.tensor(clusters, dtype=torch.float32) if clusters is not None else None + def _preprocess( self, images: list["torch.Tensor"], @@ -169,7 +170,9 @@ def _preprocess( if clusters is None: raise ValueError("Clusters must be provided for color quantization.") # Convert to torch tensor if needed (clusters might be passed as list/numpy) - clusters_torch = torch.as_tensor(clusters, dtype=torch.float32) if not isinstance(clusters, torch.Tensor) else clusters + clusters_torch = ( + torch.as_tensor(clusters, dtype=torch.float32) if not isinstance(clusters, torch.Tensor) else clusters + ) # Group images by shape for batch processing # We need to check if the pixel values are a tensor or a list of tensors diff --git a/tests/models/imagegpt/test_image_processing_imagegpt.py b/tests/models/imagegpt/test_image_processing_imagegpt.py index 998bb0d665bb..713325b1dc20 100644 --- a/tests/models/imagegpt/test_image_processing_imagegpt.py +++ b/tests/models/imagegpt/test_image_processing_imagegpt.py @@ -267,8 +267,12 @@ def test_slow_fast_equivalence(self): self.assertIsInstance(image, np.ndarray) # Test not batched input - encoding_slow = self.image_processing_class(**self.image_processor_dict)(image_inputs[0], return_tensors="pt") - encoding_fast = self.fast_image_processing_class(**self.image_processor_dict)(image_inputs[0], return_tensors="pt") + encoding_slow = self.image_processing_class(**self.image_processor_dict)( + image_inputs[0], return_tensors="pt" + ) + encoding_fast = 
self.fast_image_processing_class(**self.image_processor_dict)( + image_inputs[0], return_tensors="pt" + ) # Convert to float for mean calculation since input_ids are integers slow_tensor = encoding_slow.input_ids.float() fast_tensor = encoding_fast.input_ids.float() @@ -278,7 +282,9 @@ def test_slow_fast_equivalence(self): # Test batched encoding_slow = self.image_processing_class(**self.image_processor_dict)(image_inputs, return_tensors="pt") - encoding_fast = self.fast_image_processing_class(**self.image_processor_dict)(image_inputs, return_tensors="pt") + encoding_fast = self.fast_image_processing_class(**self.image_processor_dict)( + image_inputs, return_tensors="pt" + ) # Convert to float for mean calculation since input_ids are integers slow_tensor = encoding_slow.input_ids.float() fast_tensor = encoding_fast.input_ids.float() @@ -297,7 +303,9 @@ def test_slow_fast_equivalence_batched(self): # Test batched encoding_slow = self.image_processing_class(**self.image_processor_dict)(image_inputs, return_tensors="pt") - encoding_fast = self.fast_image_processing_class(**self.image_processor_dict)(image_inputs, return_tensors="pt") + encoding_fast = self.fast_image_processing_class(**self.image_processor_dict)( + image_inputs, return_tensors="pt" + ) # Convert to float for mean calculation since input_ids are integers slow_tensor = encoding_slow.input_ids.float() fast_tensor = encoding_fast.input_ids.float() From 898a80768da87d7e45668477ec0ca6713ee938a3 Mon Sep 17 00:00:00 2001 From: agamjots Date: Mon, 25 Aug 2025 21:32:06 -0700 Subject: [PATCH 32/33] fix(ruff after merging main) --- tests/models/imagegpt/test_image_processing_imagegpt.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/models/imagegpt/test_image_processing_imagegpt.py b/tests/models/imagegpt/test_image_processing_imagegpt.py index d64b6dde3cd0..90ed10aee5df 100644 --- a/tests/models/imagegpt/test_image_processing_imagegpt.py +++ b/tests/models/imagegpt/test_image_processing_imagegpt.py @@ 
-142,7 +142,6 @@ def test_image_processor_to_json_file(self): image_processor_first.to_json_file(json_file_path) image_processor_second = image_processing_class.from_json_file(json_file_path).to_dict() - image_processor_first = image_processor_first.to_dict() for key, value in image_processor_first.items(): if key == "clusters": @@ -150,7 +149,6 @@ def test_image_processor_to_json_file(self): else: self.assertEqual(image_processor_first[key], value) - def test_image_processor_from_and_save_pretrained(self): for image_processing_class in self.image_processor_list: image_processor_first = image_processing_class(**self.image_processor_dict) From f3815ce15f681d87225ae6f02845b6fdf39d27f2 Mon Sep 17 00:00:00 2001 From: yonigozlan Date: Thu, 4 Sep 2025 22:35:22 +0000 Subject: [PATCH 33/33] simplify logic and reuse standard equivalenec tests --- .../imagegpt/image_processing_imagegpt.py | 6 +- .../image_processing_imagegpt_fast.py | 126 +++++------------- .../test_image_processing_imagegpt.py | 117 +++++++++------- 3 files changed, 104 insertions(+), 145 deletions(-) diff --git a/src/transformers/models/imagegpt/image_processing_imagegpt.py b/src/transformers/models/imagegpt/image_processing_imagegpt.py index 66f4f28f083e..1f2026627515 100644 --- a/src/transformers/models/imagegpt/image_processing_imagegpt.py +++ b/src/transformers/models/imagegpt/image_processing_imagegpt.py @@ -26,7 +26,7 @@ PILImageResampling, infer_channel_dimension_format, is_scaled_image, - make_flat_list_of_images, + make_list_of_images, to_numpy_array, valid_images, validate_preprocess_arguments, @@ -238,7 +238,7 @@ def preprocess( clusters = clusters if clusters is not None else self.clusters clusters = np.array(clusters) - images = make_flat_list_of_images(images) + images = make_list_of_images(images) if not valid_images(images): raise ValueError( @@ -247,7 +247,7 @@ def preprocess( ) # Here, normalize() is using a constant factor to divide pixel values. 
- # hence, the method does not need image_mean and image_std. + # hence, the method does not need iamge_mean and image_std. validate_preprocess_arguments( do_resize=do_resize, size=size, diff --git a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py index 53d32ffd1e70..736666fd28a0 100644 --- a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py +++ b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py @@ -81,16 +81,16 @@ def color_quantize_torch(x: torch.Tensor, clusters: torch.Tensor) -> torch.Tenso class ImageGPTFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): """ - clusters (`np.ndarray` or `list[list[int]]`, *optional*): + clusters (`np.ndarray` or `list[list[int]]` or `torch.Tensor`, *optional*): The color clusters to use, of shape `(n_clusters, 3)` when color quantizing. Can be overridden by `clusters` in `preprocess`. - resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`): - Resampling filter to use if resizing the image. Can be overridden by `resample` in `preprocess`. + do_color_quantize (`bool`, *optional*, defaults to `True`): + Controls whether to apply color quantization to convert continuous pixel values to discrete cluster indices. + When True, each pixel is assigned to its nearest color cluster, enabling ImageGPT's discrete token modeling. 
""" - clusters: Optional[np.ndarray] = None - resample: Optional[PILImageResampling] = PILImageResampling.BILINEAR - do_color_quantize: Optional[bool] = True + clusters: Optional[Union[np.ndarray, list[list[int]], torch.Tensor]] + do_color_quantize: Optional[bool] @auto_docstring @@ -99,23 +99,24 @@ class ImageGPTImageProcessorFast(BaseImageProcessorFast): resample = PILImageResampling.BILINEAR do_color_quantize = True clusters = None - - # Use standard normalization with image_mean=[0.5, 0.5, 0.5] and image_std=[0.5, 0.5, 0.5] - # This is equivalent to ImageGPT's (x/127.5 - 1) normalization image_mean = [0.5, 0.5, 0.5] image_std = [0.5, 0.5, 0.5] do_rescale = True do_normalize = True + valid_kwargs = ImageGPTFastImageProcessorKwargs - # We are keeping this for backwards compatibility def __init__( self, - clusters: Optional[Union[list, np.ndarray]] = None, + clusters: Optional[Union[list, np.ndarray, torch.Tensor]] = None, # keep as arg for backwards compatibility **kwargs: Unpack[ImageGPTFastImageProcessorKwargs], ): - super().__init__(**kwargs) - # Store clusters as torch tensor directly for efficiency - self.clusters = torch.tensor(clusters, dtype=torch.float32) if clusters is not None else None + r""" + clusters (`np.ndarray` or `list[list[int]]` or `torch.Tensor`, *optional*): + The color clusters to use, of shape `(n_clusters, 3)` when color quantizing. Can be overridden by `clusters` + in `preprocess`. 
+ """ + clusters = torch.as_tensor(clusters, dtype=torch.float32) if clusters is not None else None + super().__init__(clusters=clusters, **kwargs) def _preprocess( self, @@ -136,8 +137,6 @@ def _preprocess( return_tensors: Optional[Union[str, TensorType]] = None, **kwargs, ): - # Unrolled standard image processing pipeline for clarity - # Group images by size for batched resizing grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping) resized_images_grouped = {} @@ -163,97 +162,38 @@ def _preprocess( pixel_values = reorder_images(processed_images_grouped, grouped_images_index) # If color quantization is requested, perform it; otherwise return pixel values - do_color_quantize = do_color_quantize if do_color_quantize is not None else self.do_color_quantize if do_color_quantize: # Prepare clusters - clusters = clusters if clusters is not None else self.clusters if clusters is None: raise ValueError("Clusters must be provided for color quantization.") # Convert to torch tensor if needed (clusters might be passed as list/numpy) clusters_torch = ( torch.as_tensor(clusters, dtype=torch.float32) if not isinstance(clusters, torch.Tensor) else clusters - ) + ).to(pixel_values[0].device, dtype=pixel_values[0].dtype) # Group images by shape for batch processing # We need to check if the pixel values are a tensor or a list of tensors - if isinstance(pixel_values, torch.Tensor): - # Single batch case - images_list = [pixel_values] - else: - # Multiple images case, we group by shape - shape_groups = {} - for i, img in enumerate(pixel_values): - shape = img.shape - if shape not in shape_groups: - shape_groups[shape] = [] - shape_groups[shape].append((i, img)) - - images_list = [] - for shape, group in shape_groups.items(): - if len(group) > 1: - # Batch process images of same shape - batch_imgs = torch.stack([img for _, img in group]) - images_list.append((batch_imgs, [idx for idx, _ in group])) - else: - # Single image - idx, img = 
group[0] - images_list.append((img.unsqueeze(0), [idx])) - + grouped_images, grouped_images_index = group_images_by_shape( + pixel_values, disable_grouping=disable_grouping + ) # Process each group - all_input_ids = [None] * len(pixel_values) if not isinstance(pixel_values, torch.Tensor) else None - - for group_data in images_list: - if isinstance(pixel_values, torch.Tensor): - # Single batch case - batch_imgs = group_data - batch_size = batch_imgs.shape[0] - # Convert from CHW to HWC and flatten - batch_hwc = batch_imgs.permute(0, 2, 3, 1) # (B, H, W, C) - batch_flat = batch_hwc.reshape(batch_size, -1, 3) # (B, H*W, C) - - # Quantize each image in the batch - device_clusters = clusters_torch.to(batch_flat.device, dtype=batch_flat.dtype) - input_ids = color_quantize_torch(batch_flat.reshape(-1, 3), device_clusters) - input_ids = input_ids.reshape(batch_size, -1) # (B, H*W) - - return BatchFeature( - data={"input_ids": input_ids}, - tensor_type=return_tensors, - ) - else: - # Multiple images case - batch_imgs, indices = group_data - if batch_imgs.shape[0] == 1: - # Single image - img = batch_imgs.squeeze(0) - img_hwc = img.permute(1, 2, 0) # (H, W, C) - pixels = img_hwc.reshape(-1, 3) # (H*W, C) - - device_clusters = clusters_torch.to(pixels.device, dtype=pixels.dtype) - input_ids = color_quantize_torch(pixels, device_clusters) - - all_input_ids[indices[0]] = input_ids - else: - # Batch of same shape - batch_hwc = batch_imgs.permute(0, 2, 3, 1) # (B, H, W, C) - batch_flat = batch_hwc.reshape(batch_imgs.shape[0], -1, 3) # (B, H*W, C) - - device_clusters = clusters_torch.to(batch_flat.device, dtype=batch_flat.dtype) - input_ids = color_quantize_torch(batch_flat.reshape(-1, 3), device_clusters) - input_ids = input_ids.reshape(batch_imgs.shape[0], -1) # (B, H*W) - - for i, idx in enumerate(indices): - all_input_ids[idx] = input_ids[i] - - # Stack input_ids if returning tensors - if return_tensors: - all_input_ids = torch.stack(all_input_ids, dim=0) + input_ids_grouped = 
{}
+
+        for shape, stacked_images in grouped_images.items():
+            input_ids = color_quantize_torch(
+                stacked_images.permute(0, 2, 3, 1).reshape(-1, 3), clusters_torch
+            )  # (B*H*W,)
+            input_ids_grouped[shape] = input_ids.reshape(
+                stacked_images.shape[0], -1
+            )  # (B, H*W)
+
+        input_ids = reorder_images(input_ids_grouped, grouped_images_index)
+
             return BatchFeature(
-                data={"input_ids": all_input_ids},
+                data={"input_ids": torch.stack(input_ids, dim=0) if return_tensors else input_ids},
                 tensor_type=return_tensors,
             )
-
         # Otherwise, return normalized pixel values
         pixel_values = torch.stack(pixel_values, dim=0) if return_tensors else pixel_values
         return BatchFeature(data={"pixel_values": pixel_values}, tensor_type=return_tensors)
@@ -263,8 +203,6 @@ def to_dict(self):
         if output.get("clusters") is not None and isinstance(output["clusters"], torch.Tensor):
             output["clusters"] = output["clusters"].tolist()
 
-        # Ensure we match the slow processor's configuration
-        output["do_color_quantize"] = True
         return output
 
 
diff --git a/tests/models/imagegpt/test_image_processing_imagegpt.py b/tests/models/imagegpt/test_image_processing_imagegpt.py
index 90ed10aee5df..8c04d9585022 100644
--- a/tests/models/imagegpt/test_image_processing_imagegpt.py
+++ b/tests/models/imagegpt/test_image_processing_imagegpt.py
@@ -19,10 +19,20 @@
 import unittest
 
 import numpy as np
+import pytest
+import requests
 from datasets import load_dataset
+from packaging import version
 
 from transformers import AutoImageProcessor
-from transformers.testing_utils import check_json_file_has_correct_format, require_torch, require_vision, slow
+from transformers.testing_utils import (
+    check_json_file_has_correct_format,
+    require_torch,
+    require_torch_accelerator,
+    require_vision,
+    slow,
+    torch_device,
+)
 from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
 
 from ...test_image_processing_common import ImageProcessingTestMixin, 
prepare_image_inputs @@ -256,62 +266,73 @@ def test_call_pytorch(self): (self.image_processor_tester.batch_size, *expected_output_image_shape), ) + # For quantization-based processors, use absolute tolerance only to avoid infinity issues + @require_vision + @require_torch def test_slow_fast_equivalence(self): - if self.fast_image_processing_class is None: - self.skipTest("Fast image processing class not available") + if not self.test_slow_image_processor or not self.test_fast_image_processor: + self.skipTest(reason="Skipping slow/fast equivalence test") - for image_processing_class in self.image_processor_list: - # create random numpy tensors - image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True) - for image in image_inputs: - self.assertIsInstance(image, np.ndarray) + if self.image_processing_class is None or self.fast_image_processing_class is None: + self.skipTest(reason="Skipping slow/fast equivalence test as one of the image processors is not defined") - # Test not batched input - encoding_slow = self.image_processing_class(**self.image_processor_dict)( - image_inputs[0], return_tensors="pt" - ) - encoding_fast = self.fast_image_processing_class(**self.image_processor_dict)( - image_inputs[0], return_tensors="pt" - ) - # Convert to float for mean calculation since input_ids are integers - slow_tensor = encoding_slow.input_ids.float() - fast_tensor = encoding_fast.input_ids.float() - # For quantization-based processors, use absolute tolerance only to avoid infinity issues - # when one value is 0 and the other is 1. The rtol=0 prevents relative tolerance calculation. 
- self._assert_slow_fast_tensors_equivalence(slow_tensor, fast_tensor, atol=1.0, rtol=0) + dummy_image = Image.open( + requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw + ) + image_processor_slow = self.image_processing_class(**self.image_processor_dict) + image_processor_fast = self.fast_image_processing_class(**self.image_processor_dict) - # Test batched - encoding_slow = self.image_processing_class(**self.image_processor_dict)(image_inputs, return_tensors="pt") - encoding_fast = self.fast_image_processing_class(**self.image_processor_dict)( - image_inputs, return_tensors="pt" - ) - # Convert to float for mean calculation since input_ids are integers - slow_tensor = encoding_slow.input_ids.float() - fast_tensor = encoding_fast.input_ids.float() - # Once again using absolute tolerance only to avoid infinity issues - self._assert_slow_fast_tensors_equivalence(slow_tensor, fast_tensor, atol=1.0, rtol=0) + encoding_slow = image_processor_slow(dummy_image, return_tensors="pt") + encoding_fast = image_processor_fast(dummy_image, return_tensors="pt") + self._assert_slow_fast_tensors_equivalence( + encoding_slow.input_ids.float(), encoding_fast.input_ids.float(), atol=1.0, rtol=0 + ) + @require_vision + @require_torch def test_slow_fast_equivalence_batched(self): - if self.fast_image_processing_class is None: - self.skipTest("Fast image processing class not available") + if not self.test_slow_image_processor or not self.test_fast_image_processor: + self.skipTest(reason="Skipping slow/fast equivalence test") - for image_processing_class in self.image_processor_list: - # create random numpy tensors - image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True) - for image in image_inputs: - self.assertIsInstance(image, np.ndarray) + if self.image_processing_class is None or self.fast_image_processing_class is None: + self.skipTest(reason="Skipping slow/fast equivalence test as one of the image 
processors is not defined") - # Test batched - encoding_slow = self.image_processing_class(**self.image_processor_dict)(image_inputs, return_tensors="pt") - encoding_fast = self.fast_image_processing_class(**self.image_processor_dict)( - image_inputs, return_tensors="pt" + if hasattr(self.image_processor_tester, "do_center_crop") and self.image_processor_tester.do_center_crop: + self.skipTest( + reason="Skipping as do_center_crop is True and center_crop functions are not equivalent for fast and slow processors" ) - # Convert to float for mean calculation since input_ids are integers - slow_tensor = encoding_slow.input_ids.float() - fast_tensor = encoding_fast.input_ids.float() - # For quantization-based processors, use absolute tolerance only to avoid infinity issues - # when one value is 0 and the other is 1. The rtol=0 prevents relative tolerance calculation. - self._assert_slow_fast_tensors_equivalence(slow_tensor, fast_tensor, atol=1.0, rtol=0) + + dummy_images = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True) + image_processor_slow = self.image_processing_class(**self.image_processor_dict) + image_processor_fast = self.fast_image_processing_class(**self.image_processor_dict) + + encoding_slow = image_processor_slow(dummy_images, return_tensors="pt") + encoding_fast = image_processor_fast(dummy_images, return_tensors="pt") + + self._assert_slow_fast_tensors_equivalence( + encoding_slow.input_ids.float(), encoding_fast.input_ids.float(), atol=1.0, rtol=0 + ) + + @slow + @require_torch_accelerator + @require_vision + @pytest.mark.torch_compile_test + def test_can_compile_fast_image_processor(self): + if self.fast_image_processing_class is None: + self.skipTest("Skipping compilation test as fast image processor is not defined") + if version.parse(torch.__version__) < version.parse("2.3"): + self.skipTest(reason="This test requires torch >= 2.3 to run.") + + torch.compiler.reset() + input_image = torch.randint(0, 255, (3, 
224, 224), dtype=torch.uint8) + image_processor = self.fast_image_processing_class(**self.image_processor_dict) + output_eager = image_processor(input_image, device=torch_device, return_tensors="pt") + + image_processor = torch.compile(image_processor, mode="reduce-overhead") + output_compiled = image_processor(input_image, device=torch_device, return_tensors="pt") + self._assert_slow_fast_tensors_equivalence( + output_eager.input_ids.float(), output_compiled.input_ids.float(), atol=1.0, rtol=0 + ) def prepare_images():