From aea65470dcb19576e9ac8e929f3ecb34cb1af794 Mon Sep 17 00:00:00 2001 From: agamjots Date: Tue, 22 Jul 2025 11:53:56 -0700 Subject: [PATCH 01/33] initial commit --- .../models/imagegpt/image_processing_imagegpt_fast.py | 1 + 1 file changed, 1 insertion(+) create mode 100644 src/transformers/models/imagegpt/image_processing_imagegpt_fast.py diff --git a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py new file mode 100644 index 000000000000..c23659b3d5a2 --- /dev/null +++ b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py @@ -0,0 +1 @@ +#Will add fast image processing for imagegpt \ No newline at end of file From d49fa73af4e3e5e3230ae2f639c9d129851498f4 Mon Sep 17 00:00:00 2001 From: agamjots Date: Tue, 22 Jul 2025 12:38:44 -0700 Subject: [PATCH 02/33] initial setup --- docs/source/en/model_doc/imagegpt.md | 5 ++ .../models/auto/image_processing_auto.py | 2 +- src/transformers/models/imagegpt/__init__.py | 1 + .../image_processing_imagegpt_fast.py | 48 ++++++++++++++++++- .../test_image_processing_imagegpt.py | 6 ++- 5 files changed, 59 insertions(+), 3 deletions(-) diff --git a/docs/source/en/model_doc/imagegpt.md b/docs/source/en/model_doc/imagegpt.md index 7fbec62d30bb..ac6b664d439c 100644 --- a/docs/source/en/model_doc/imagegpt.md +++ b/docs/source/en/model_doc/imagegpt.md @@ -103,6 +103,11 @@ If you're interested in submitting a resource to be included here, please feel f [[autodoc]] ImageGPTImageProcessor - preprocess +## ImageGPTImageProcessorFast + +[[autodoc]] ImageGPTImageProcessorFast + - preprocess + ## ImageGPTModel [[autodoc]] ImageGPTModel diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index 775d94b25b91..cb664bb1040e 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -104,7 +104,7 @@ 
("idefics2", ("Idefics2ImageProcessor", "Idefics2ImageProcessorFast")), ("idefics3", ("Idefics3ImageProcessor", "Idefics3ImageProcessorFast")), ("ijepa", ("ViTImageProcessor", "ViTImageProcessorFast")), - ("imagegpt", ("ImageGPTImageProcessor",)), + ("imagegpt", ("ImageGPTImageProcessor", "ImageGPTImageProcessorFast")), ("instructblip", ("BlipImageProcessor", "BlipImageProcessorFast")), ("instructblipvideo", ("InstructBlipVideoImageProcessor",)), ("janus", ("JanusImageProcessor")), diff --git a/src/transformers/models/imagegpt/__init__.py b/src/transformers/models/imagegpt/__init__.py index cb79cea50d6e..098ffb6296f5 100644 --- a/src/transformers/models/imagegpt/__init__.py +++ b/src/transformers/models/imagegpt/__init__.py @@ -21,6 +21,7 @@ from .configuration_imagegpt import * from .feature_extraction_imagegpt import * from .image_processing_imagegpt import * + from .image_processing_imagegpt_fast import * from .modeling_imagegpt import * else: import sys diff --git a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py index c23659b3d5a2..8b823e5675a9 100644 --- a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py +++ b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py @@ -1 +1,47 @@ -#Will add fast image processing for imagegpt \ No newline at end of file +# coding=utf-8 +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +"""Fast Image processor class for ImageGPT.""" + +from ...image_processing_utils_fast import BaseImageProcessorFast +from ...image_utils import PILImageResampling +from ...utils import auto_docstring + + +@auto_docstring +class ImageGPTImageProcessorFast(BaseImageProcessorFast): + # This generated class can be used as a starting point for the fast image processor. + # if the image processor is only used for simple augmentations, such as resizing, center cropping, rescaling, or normalizing, + # only the default values should be set in the class. + # If the image processor requires more complex augmentations, methods from BaseImageProcessorFast can be overridden. + # In most cases, only the `_preprocess` method should be overridden. + + # For an example of a fast image processor requiring more complex augmentations, see `LlavaNextImageProcessorFast`. + + # Default values should be checked against the slow image processor + # None values left after checking can be removed + resample = PILImageResampling.BILINEAR + image_mean = None + image_std = None + size = {"height": 256, "width": 256} + default_to_square = None + crop_size = None + do_resize = True + do_center_crop = None + do_rescale = None + do_normalize = True + do_convert_rgb = None + + +__all__ = ["ImageGPTImageProcessorFast"] diff --git a/tests/models/imagegpt/test_image_processing_imagegpt.py b/tests/models/imagegpt/test_image_processing_imagegpt.py index de29b8e29fbd..db26154a562c 100644 --- a/tests/models/imagegpt/test_image_processing_imagegpt.py +++ b/tests/models/imagegpt/test_image_processing_imagegpt.py @@ -23,7 +23,7 @@ from transformers import AutoImageProcessor from transformers.testing_utils import check_json_file_has_correct_format, require_torch, require_vision, slow -from transformers.utils import is_torch_available, is_vision_available +from transformers.utils import is_torch_available, 
is_torchvision_available, is_vision_available from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs @@ -36,6 +36,9 @@ from transformers import ImageGPTImageProcessor + if is_torchvision_available(): + from transformers import ImageGPTImageProcessorFast + class ImageGPTImageProcessingTester: def __init__( @@ -94,6 +97,7 @@ def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=F @require_vision class ImageGPTImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): image_processing_class = ImageGPTImageProcessor if is_vision_available() else None + fast_image_processing_class = ImageGPTImageProcessorFast if is_torchvision_available() else None def setUp(self): super().setUp() From aead2217fbe9559f60ec552f4ec0bc4eaabc1c6b Mon Sep 17 00:00:00 2001 From: agamjots Date: Fri, 25 Jul 2025 00:42:38 -0700 Subject: [PATCH 03/33] Overiding imageGPT specific functions --- .../imagegpt/image_processing_imagegpt_fast.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py index 8b823e5675a9..69ca7ff3447b 100644 --- a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py +++ b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py @@ -14,11 +14,26 @@ # limitations under the License. 
"""Fast Image processor class for ImageGPT.""" +import numpy as np +from typing import Dict, List, Optional, Tuple, Union + from ...image_processing_utils_fast import BaseImageProcessorFast from ...image_utils import PILImageResampling from ...utils import auto_docstring +def squared_euclidean_distance_fast(a, b): + b = b.T + a2 = torch.sum(a ** 2, dim = 1) + b2 = torch.sum(b ** 2, dim = 0) + ab = torch.matmul(a, b) + d = a2[:, None] - 2 * ab + b2[None, :] + return d + +def color_quantize_fast(x, clusters): + x = x.reshape(-1, 3) + d = squared_euclidean_distance_fast(x, clusters) + return np.argmin(d, axis=1) @auto_docstring class ImageGPTImageProcessorFast(BaseImageProcessorFast): # This generated class can be used as a starting point for the fast image processor. From dc1b1910fbc5885dd367015d28dedf68f234c1b8 Mon Sep 17 00:00:00 2001 From: Ethan Ayaay Date: Mon, 28 Jul 2025 11:57:58 -0700 Subject: [PATCH 04/33] imported is_torch_available and utilized it for importing torch in imageGPT fast --- .../image_processing_imagegpt_fast.py | 35 +++++++++++++------ 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py index 69ca7ff3447b..e3c94fcd6fc4 100644 --- a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py +++ b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py @@ -19,7 +19,13 @@ from ...image_processing_utils_fast import BaseImageProcessorFast from ...image_utils import PILImageResampling -from ...utils import auto_docstring +from ...utils import ( + auto_docstring, + is_torch_available +) + +if is_torch_available(): + import torch def squared_euclidean_distance_fast(a, b): b = b.T @@ -29,11 +35,11 @@ def squared_euclidean_distance_fast(a, b): d = a2[:, None] - 2 * ab + b2[None, :] return d - def color_quantize_fast(x, clusters): x = x.reshape(-1, 3) d = squared_euclidean_distance_fast(x, 
clusters) return np.argmin(d, axis=1) + @auto_docstring class ImageGPTImageProcessorFast(BaseImageProcessorFast): # This generated class can be used as a starting point for the fast image processor. @@ -47,16 +53,25 @@ class ImageGPTImageProcessorFast(BaseImageProcessorFast): # Default values should be checked against the slow image processor # None values left after checking can be removed resample = PILImageResampling.BILINEAR - image_mean = None - image_std = None - size = {"height": 256, "width": 256} - default_to_square = None - crop_size = None + size = {"height": 256, "width": 256} # import get_size_dict? do_resize = True - do_center_crop = None - do_rescale = None do_normalize = True - do_convert_rgb = None + # need: + # clusters, resample, do_color_quantize + + # initialize these arguments, pass it into super constructor + + # not in base: + image_mean = None # not in base, normalize uses a constant factor to divide pixel values + image_std = None # not in base, normalize uses a constant factor to divide pixel values + default_to_square = None # not in base + crop_size = None # not in base + do_center_crop = None # not in base + do_rescale = None # not in base + do_convert_rgb = None # not in base + +# preprocessor has additional kwargs: + # images, return_tensors, data_format, input_data_format __all__ = ["ImageGPTImageProcessorFast"] From daedee9aa6327966f4635f5645aec1a663229a18 Mon Sep 17 00:00:00 2001 From: Ethan Ayaay Date: Mon, 28 Jul 2025 12:26:16 -0700 Subject: [PATCH 05/33] Created init and ImageGPTFastImageProcessorKwargs --- .../image_processing_imagegpt_fast.py | 20 ++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py index e3c94fcd6fc4..938df44936b5 100644 --- a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py +++ 
b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py @@ -17,7 +17,11 @@ import numpy as np from typing import Dict, List, Optional, Tuple, Union -from ...image_processing_utils_fast import BaseImageProcessorFast +from ...image_processing_utils_fast import ( + BaseImageProcessorFast, + DefaultFastImageProcessorKwargs +) +from ...processing_utils import Unpack from ...image_utils import PILImageResampling from ...utils import ( auto_docstring, @@ -40,6 +44,11 @@ def color_quantize_fast(x, clusters): d = squared_euclidean_distance_fast(x, clusters) return np.argmin(d, axis=1) +class ImageGPTFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): + do_color_quantize: Optional[bool] = True + clusters: Optional[np.ndarray] = None + resample: Optional[PILImageResampling] = PILImageResampling.BILINEAR + @auto_docstring class ImageGPTImageProcessorFast(BaseImageProcessorFast): # This generated class can be used as a starting point for the fast image processor. @@ -56,10 +65,15 @@ class ImageGPTImageProcessorFast(BaseImageProcessorFast): size = {"height": 256, "width": 256} # import get_size_dict? 
do_resize = True do_normalize = True - # need: - # clusters, resample, do_color_quantize + + # Specific Kwargs + do_color_quantize = True + clusters = None + resample = PILImageResampling.BILINEAR # initialize these arguments, pass it into super constructor + def __init__(self, **kwargs: Unpack[ImageGPTFastImageProcessorKwargs]): + super().__init__(**kwargs) # not in base: image_mean = None # not in base, normalize uses a constant factor to divide pixel values From 8608e19e8b995149e5794948dee62226782e2671 Mon Sep 17 00:00:00 2001 From: Ethan Ayaay Date: Mon, 28 Jul 2025 12:36:47 -0700 Subject: [PATCH 06/33] added return_tensors, data_format, and input_data_format to ImageGPTFastImageProcessorKwargs --- .../image_processing_imagegpt_fast.py | 35 ++++++++++++++----- 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py index 938df44936b5..344521438b20 100644 --- a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py +++ b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py @@ -19,13 +19,19 @@ from ...image_processing_utils_fast import ( BaseImageProcessorFast, - DefaultFastImageProcessorKwargs + DefaultFastImageProcessorKwargs, + BatchFeature ) from ...processing_utils import Unpack -from ...image_utils import PILImageResampling +from ...image_utils import ( + PILImageResampling, + ImageInput, + ChannelDimension +) from ...utils import ( auto_docstring, - is_torch_available + is_torch_available, + TensorType ) if is_torch_available(): @@ -48,6 +54,9 @@ class ImageGPTFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): do_color_quantize: Optional[bool] = True clusters: Optional[np.ndarray] = None resample: Optional[PILImageResampling] = PILImageResampling.BILINEAR + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: Optional[Union[str, ChannelDimension]] = 
ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, @auto_docstring class ImageGPTImageProcessorFast(BaseImageProcessorFast): @@ -71,10 +80,6 @@ class ImageGPTImageProcessorFast(BaseImageProcessorFast): clusters = None resample = PILImageResampling.BILINEAR - # initialize these arguments, pass it into super constructor - def __init__(self, **kwargs: Unpack[ImageGPTFastImageProcessorKwargs]): - super().__init__(**kwargs) - # not in base: image_mean = None # not in base, normalize uses a constant factor to divide pixel values image_std = None # not in base, normalize uses a constant factor to divide pixel values @@ -84,8 +89,20 @@ def __init__(self, **kwargs: Unpack[ImageGPTFastImageProcessorKwargs]): do_rescale = None # not in base do_convert_rgb = None # not in base -# preprocessor has additional kwargs: - # images, return_tensors, data_format, input_data_format + # initialize these arguments, pass it into super constructor + def __init__(self, **kwargs: Unpack[ImageGPTFastImageProcessorKwargs]): + super().__init__(**kwargs) + + # _preprocessor has additional kwargs: + # images, return_tensors, data_format, input_data_format + + # PUBLIC preprocess: + def preprocess(self, images: ImageInput, **kwargs: Unpack[ImageGPTFastImageProcessorKwargs]) -> BatchFeature: + return super().preprocess(images, **kwargs) + # PRIVATE preprocess: + def _preprocess(self): + # TODO: Override + pass __all__ = ["ImageGPTImageProcessorFast"] From b772356022d71ab2f771ee39e1a301d6d2b3ded4 Mon Sep 17 00:00:00 2001 From: Ethan Ayaay Date: Mon, 28 Jul 2025 12:45:05 -0700 Subject: [PATCH 07/33] set up arguments and process and _preprocess definitions --- .../models/imagegpt/image_processing_imagegpt_fast.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py index 344521438b20..88f852b2d03c 100644 --- 
a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py +++ b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py @@ -56,7 +56,7 @@ class ImageGPTFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): resample: Optional[PILImageResampling] = PILImageResampling.BILINEAR return_tensors: Optional[Union[str, TensorType]] = None, data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.FIRST, - input_data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None @auto_docstring class ImageGPTImageProcessorFast(BaseImageProcessorFast): @@ -80,7 +80,7 @@ class ImageGPTImageProcessorFast(BaseImageProcessorFast): clusters = None resample = PILImageResampling.BILINEAR - # not in base: + # not in base ########## image_mean = None # not in base, normalize uses a constant factor to divide pixel values image_std = None # not in base, normalize uses a constant factor to divide pixel values default_to_square = None # not in base @@ -88,6 +88,7 @@ class ImageGPTImageProcessorFast(BaseImageProcessorFast): do_center_crop = None # not in base do_rescale = None # not in base do_convert_rgb = None # not in base + ############ # initialize these arguments, pass it into super constructor def __init__(self, **kwargs: Unpack[ImageGPTFastImageProcessorKwargs]): From 9e80e0ac0aa849da534cb99c6c254f2565361311 Mon Sep 17 00:00:00 2001 From: chris Date: Fri, 1 Aug 2025 11:38:11 -0700 Subject: [PATCH 08/33] Added arguments to _preprocess --- .../image_processing_imagegpt_fast.py | 24 ++++++++++++++----- 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py index 88f852b2d03c..3297f5fb4954 100644 --- a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py +++ b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py @@ -13,7 
+13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Fast Image processor class for ImageGPT.""" - +import PIL import numpy as np from typing import Dict, List, Optional, Tuple, Union @@ -24,9 +24,9 @@ ) from ...processing_utils import Unpack from ...image_utils import ( - PILImageResampling, - ImageInput, - ChannelDimension + PILImageResampling, + ImageInput, + ChannelDimension, SizeDict ) from ...utils import ( auto_docstring, @@ -71,7 +71,7 @@ class ImageGPTImageProcessorFast(BaseImageProcessorFast): # Default values should be checked against the slow image processor # None values left after checking can be removed resample = PILImageResampling.BILINEAR - size = {"height": 256, "width": 256} # import get_size_dict? + size = {"height": 256, "width": 256} # import get_size_dict?, can be overridden in preprocess do_resize = True do_normalize = True @@ -102,8 +102,20 @@ def preprocess(self, images: ImageInput, **kwargs: Unpack[ImageGPTFastImageProce return super().preprocess(images, **kwargs) # PRIVATE preprocess: - def _preprocess(self): + def _preprocess( + self, + images: list["torch.Tensor"], + do_resize: bool, + size: SizeDict, + do_normalize: bool, + return_tensors: Optional[Union[str, TensorType]], + **kwargs, + ) -> BatchFeature: # TODO: Override + # Resize to specific size + # Normalize pixel values + # Optionally color quantize into clusters + # Return processed images in a specified tensor format pass __all__ = ["ImageGPTImageProcessorFast"] From f9f3ad8cd057585e372cede7376251f6f7d194f4 Mon Sep 17 00:00:00 2001 From: chris Date: Fri, 1 Aug 2025 12:23:15 -0700 Subject: [PATCH 09/33] Added additional optional arguments --- .../image_processing_imagegpt_fast.py | 46 +++++++++++++++++-- 1 file changed, 41 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py index 
3297f5fb4954..0c625a358304 100644 --- a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py +++ b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py @@ -17,16 +17,19 @@ import numpy as np from typing import Dict, List, Optional, Tuple, Union +from ..mllama.image_processing_mllama import to_channel_dimension_format +from ...image_processing_utils import get_size_dict from ...image_processing_utils_fast import ( - BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, - BatchFeature + BaseImageProcessorFast, + DefaultFastImageProcessorKwargs, + BatchFeature, logger ) from ...processing_utils import Unpack from ...image_utils import ( PILImageResampling, ImageInput, - ChannelDimension, SizeDict + ChannelDimension, SizeDict, make_list_of_images, valid_images, validate_preprocess_arguments, is_scaled_image, + infer_channel_dimension_format ) from ...utils import ( auto_docstring, @@ -107,15 +110,48 @@ def _preprocess( images: list["torch.Tensor"], do_resize: bool, size: SizeDict, + resample: PILImageResampling, do_normalize: bool, + do_color_quantize: Optional[bool], + clusters: Optional[Union[list[list[int]], np.ndarray]], return_tensors: Optional[Union[str, TensorType]], + data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, **kwargs, ) -> BatchFeature: + # TODO: Override # Resize to specific size # Normalize pixel values # Optionally color quantize into clusters # Return processed images in a specified tensor format - pass + + do_resize = do_resize if do_resize is not None else self.do_resize + size = size if size is not None else self.size + resample = resample if resample is not None else self.resample + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + do_color_quantize = do_color_quantize if do_color_quantize is not None else self.do_color_quantize + clusters = clusters if clusters is not None else 
self.clusters + clusters = np.array(clusters) + + images = make_list_of_images(images) + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + + # Here, normalize() is using a constant factor to divide pixel values. + # hence, the method does not need iamge_mean and image_std. + validate_preprocess_arguments( + do_resize=do_resize, + size=size, + resample=resample, + ) + + if do_color_quantize and clusters is None: + raise ValueError("Clusters must be specified if do_color_quantize is True.") + __all__ = ["ImageGPTImageProcessorFast"] From 870cd9ade501dd06183bda2259ece381eb17158f Mon Sep 17 00:00:00 2001 From: Ethan Ayaay Date: Fri, 1 Aug 2025 13:01:06 -0700 Subject: [PATCH 10/33] Copied logic over from base imageGPT processor --- .../image_processing_imagegpt_fast.py | 65 ++++++++++++++++--- 1 file changed, 55 insertions(+), 10 deletions(-) diff --git a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py index 0c625a358304..5993a0e2bcbd 100644 --- a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py +++ b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py @@ -17,8 +17,8 @@ import numpy as np from typing import Dict, List, Optional, Tuple, Union -from ..mllama.image_processing_mllama import to_channel_dimension_format -from ...image_processing_utils import get_size_dict +from transformers.image_transforms import to_channel_dimension_format + from ...image_processing_utils_fast import ( BaseImageProcessorFast, DefaultFastImageProcessorKwargs, @@ -28,8 +28,8 @@ from ...image_utils import ( PILImageResampling, ImageInput, - ChannelDimension, SizeDict, make_list_of_images, valid_images, validate_preprocess_arguments, is_scaled_image, - infer_channel_dimension_format + ChannelDimension, SizeDict, + infer_channel_dimension_format, 
make_list_of_images, valid_images, validate_preprocess_arguments, is_scaled_image, ) from ...utils import ( auto_docstring, @@ -54,6 +54,7 @@ def color_quantize_fast(x, clusters): return np.argmin(d, axis=1) class ImageGPTFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): + # TODO: Add documentation for each argument do_color_quantize: Optional[bool] = True clusters: Optional[np.ndarray] = None resample: Optional[PILImageResampling] = PILImageResampling.BILINEAR @@ -120,12 +121,6 @@ def _preprocess( **kwargs, ) -> BatchFeature: - # TODO: Override - # Resize to specific size - # Normalize pixel values - # Optionally color quantize into clusters - # Return processed images in a specified tensor format - do_resize = do_resize if do_resize is not None else self.do_resize size = size if size is not None else self.size resample = resample if resample is not None else self.resample @@ -154,4 +149,54 @@ def _preprocess( raise ValueError("Clusters must be specified if do_color_quantize is True.") + # TODO: + + # Resize to specific size + + # Normalize pixel values + + # Optionally color quantize into clusters + + # Return processed images in a specified tensor format + + if do_normalize and is_scaled_image(images[0]): + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. If you wish to do this, " + "make sure to set `do_normalize` to `False` and that pixel values are between [-1, 1].", + ) + + if input_data_format is None: + # We assume that all images have the same channel dimension format. 
+ input_data_format = infer_channel_dimension_format(images[0]) + + if do_resize: + images = [ + self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format) + for image in images + ] + + if do_normalize: + images = [self.normalize(image=image, input_data_format=input_data_format) for image in images] + + if do_color_quantize: + images = [to_channel_dimension_format(image, ChannelDimension.LAST, input_data_format) for image in images] + # color quantize from (batch_size, height, width, 3) to (batch_size, height, width) + images = np.array(images) + images = color_quantize_fast(images, clusters).reshape(images.shape[:-1]) + + # flatten to (batch_size, height*width) + batch_size = images.shape[0] + images = images.reshape(batch_size, -1) + + # We need to convert back to a list of images to keep consistent behaviour across processors. + images = list(images) + else: + images = [ + to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) + for image in images + ] + + data = {"input_ids": images} + return BatchFeature(data=data, tensor_type=return_tensors) + __all__ = ["ImageGPTImageProcessorFast"] From 3604c7a15908f722eb2bf58f725840816c33d424 Mon Sep 17 00:00:00 2001 From: Ethan Ayaay Date: Tue, 5 Aug 2025 12:58:09 -0700 Subject: [PATCH 11/33] Implemented 2nd draft of fast imageGPT preprocess using batch processing --- .../image_processing_imagegpt_fast.py | 122 +++++++++--------- 1 file changed, 59 insertions(+), 63 deletions(-) diff --git a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py index 5993a0e2bcbd..7649995b5fd1 100644 --- a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py +++ b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py @@ -22,7 +22,8 @@ from ...image_processing_utils_fast import ( BaseImageProcessorFast, DefaultFastImageProcessorKwargs, - BatchFeature, logger + 
BatchFeature, logger, + group_images_by_shape, reorder_images ) from ...processing_utils import Unpack from ...image_utils import ( @@ -111,92 +112,87 @@ def _preprocess( images: list["torch.Tensor"], do_resize: bool, size: SizeDict, + interpolation: Optional["F.InterpolationMode"], resample: PILImageResampling, do_normalize: bool, do_color_quantize: Optional[bool], clusters: Optional[Union[list[list[int]], np.ndarray]], return_tensors: Optional[Union[str, TensorType]], + disable_grouping: Optional[bool], data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.FIRST, input_data_format: Optional[Union[str, ChannelDimension]] = None, + image_mean: Optional[Union[float, list[float]]] = None, + image_std: Optional[Union[float, list[float]]] = None, **kwargs, ) -> BatchFeature: - do_resize = do_resize if do_resize is not None else self.do_resize size = size if size is not None else self.size resample = resample if resample is not None else self.resample do_normalize = do_normalize if do_normalize is not None else self.do_normalize do_color_quantize = do_color_quantize if do_color_quantize is not None else self.do_color_quantize clusters = clusters if clusters is not None else self.clusters - clusters = np.array(clusters) - - images = make_list_of_images(images) - - if not valid_images(images): - raise ValueError( - "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " - "torch.Tensor, tf.Tensor or jax.ndarray." - ) - - # Here, normalize() is using a constant factor to divide pixel values. - # hence, the method does not need iamge_mean and image_std. - validate_preprocess_arguments( - do_resize=do_resize, - size=size, - resample=resample, - ) + # 1. Setup. Validate ImageGPT-specific requirements + # Check for do_color_quantize and clusters. if do_color_quantize and clusters is None: raise ValueError("Clusters must be specified if do_color_quantize is True.") + # Clusters come in np arrays. Convert to torch tensors. 
+ if clusters is not None: + cluster_tensors = torch.tensor(clusters, dtype=torch.float32) + if images[0].is_cuda: + # if image is stored on a CUDA GPA, convert tensors to CUDA + cluster_tensors = cluster_tensors.cuda() + + # 2. Group images into batches of the same shape for more efficient processing. + grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping) + unordered_processed_images = {} + + # Loop through shapes and stacked images + for shape, stacked_images in grouped_images.items(): + # Resize to specific sizes (if do_resize is specified) + if do_resize: + stacked_images = self.resize( + image=stacked_images, + size=size, + interpolation=interpolation + ) + + # Normalize pixel values (if do_normalize is specified) + if do_normalize: + stacked_images = stacked_images.float() + stacked_images = (stacked_images / 127.5) - 1.0 + + unordered_processed_images[shape] = stacked_images + + # 3. Reorder and maintain original image order after processing into batches + processed_images = reorder_images(unordered_processed_images, grouped_images_index) + + # 4. Color quantize if specified + if do_color_quantize: + quantized_images = [] + for image in processed_images: + # Convert CHW to HWC for quantization + image_hwc = image.permute(1, 2, 0) # (H, W, C) - # TODO: - - # Resize to specific size - - # Normalize pixel values - - # Optionally color quantize into clusters - - # Return processed images in a specified tensor format - - if do_normalize and is_scaled_image(images[0]): - logger.warning_once( - "It looks like you are trying to rescale already rescaled images. If you wish to do this, " - "make sure to set `do_normalize` to `False` and that pixel values are between [-1, 1].", - ) - - if input_data_format is None: - # We assume that all images have the same channel dimension format. 
- input_data_format = infer_channel_dimension_format(images[0]) - - if do_resize: - images = [ - self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format) - for image in images - ] + # Denormalize back to [0, 255] for quantization + image_hwc = (image_hwc + 1.0) * 127.5 + image_hwc = torch.clamp(image_hwc, 0, 255) - if do_normalize: - images = [self.normalize(image=image, input_data_format=input_data_format) for image in images] + # Fast torch-based color quantization + quantized = self._color_quantize_torch(image_hwc, cluster_tensors) - if do_color_quantize: - images = [to_channel_dimension_format(image, ChannelDimension.LAST, input_data_format) for image in images] - # color quantize from (batch_size, height, width, 3) to (batch_size, height, width) - images = np.array(images) - images = color_quantize_fast(images, clusters).reshape(images.shape[:-1]) + # Flatten to sequence (H*W,) + quantized_flat = quantized.view(-1) + quantized_images.append(quantized_flat) - # flatten to (batch_size, height*width) - batch_size = images.shape[0] - images = images.reshape(batch_size, -1) + # Stack all quantized sequences + input_ids = torch.stack(quantized_images, dim=0) - # We need to convert back to a list of images to keep consistent behaviour across processors. - images = list(images) + return BatchFeature(data={"input_ids": input_ids}, tensor_type=return_tensors) else: - images = [ - to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) - for image in images - ] - - data = {"input_ids": images} - return BatchFeature(data=data, tensor_type=return_tensors) + # 5. 
Standard output without quantizing + pixel_values = torch.stack(processed_images, dim=0) + return BatchFeature(data={"pixel_values": pixel_values}, tensor_type=return_tensors) __all__ = ["ImageGPTImageProcessorFast"] From fd5c1362b2d47471431c9b9e74f6eb6d66445a84 Mon Sep 17 00:00:00 2001 From: Ethan Ayaay Date: Tue, 5 Aug 2025 13:37:30 -0700 Subject: [PATCH 12/33] Implemented 3rd draft of imageGPT fast _preprocessor. Pulled logic from BaseImageProcessorFast --- .../image_processing_imagegpt_fast.py | 70 +++++++++++++------ 1 file changed, 48 insertions(+), 22 deletions(-) diff --git a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py index 7649995b5fd1..abd548a5a2ab 100644 --- a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py +++ b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py @@ -35,11 +35,14 @@ from ...utils import ( auto_docstring, is_torch_available, + is_torchvision_available, TensorType ) if is_torch_available(): import torch + if is_torchvision_available(): + from torchvision.transforms import functional as F def squared_euclidean_distance_fast(a, b): b = b.T @@ -52,7 +55,7 @@ def squared_euclidean_distance_fast(a, b): def color_quantize_fast(x, clusters): x = x.reshape(-1, 3) d = squared_euclidean_distance_fast(x, clusters) - return np.argmin(d, axis=1) + return torch.argmin(d, dim=1) class ImageGPTFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): # TODO: Add documentation for each argument @@ -86,8 +89,8 @@ class ImageGPTImageProcessorFast(BaseImageProcessorFast): resample = PILImageResampling.BILINEAR # not in base ########## - image_mean = None # not in base, normalize uses a constant factor to divide pixel values - image_std = None # not in base, normalize uses a constant factor to divide pixel values + image_mean = [0.5, 0.5, 0.5] # not in base, normalize uses a constant factor to divide pixel values + image_std = [0.5, 
0.5, 0.5] # not in base, normalize uses a constant factor to divide pixel values default_to_square = None # not in base crop_size = None # not in base do_center_crop = None # not in base @@ -113,16 +116,20 @@ def _preprocess( do_resize: bool, size: SizeDict, interpolation: Optional["F.InterpolationMode"], - resample: PILImageResampling, + do_center_crop: bool, + crop_size: SizeDict, + do_rescale: bool, + rescale_factor: float, do_normalize: bool, - do_color_quantize: Optional[bool], - clusters: Optional[Union[list[list[int]], np.ndarray]], - return_tensors: Optional[Union[str, TensorType]], - disable_grouping: Optional[bool], - data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.FIRST, - input_data_format: Optional[Union[str, ChannelDimension]] = None, image_mean: Optional[Union[float, list[float]]] = None, image_std: Optional[Union[float, list[float]]] = None, + disable_grouping: Optional[bool] = False, + return_tensors: Optional[Union[str, TensorType]] = None, + resample: Optional[PILImageResampling] = None, + do_color_quantize: Optional[bool] = None, + clusters: Optional[Union[list[list[int]], np.ndarray]] = None, + data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, **kwargs, ) -> BatchFeature: do_resize = do_resize if do_resize is not None else self.do_resize @@ -131,6 +138,8 @@ def _preprocess( do_normalize = do_normalize if do_normalize is not None else self.do_normalize do_color_quantize = do_color_quantize if do_color_quantize is not None else self.do_color_quantize clusters = clusters if clusters is not None else self.clusters + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std # 1. Setup. Validate ImageGPT-specific requirements # Check for do_color_quantize and clusters. 
@@ -138,16 +147,16 @@ def _preprocess( raise ValueError("Clusters must be specified if do_color_quantize is True.") # Clusters come in np arrays. Convert to torch tensors. + cluster_tensors = None if clusters is not None: cluster_tensors = torch.tensor(clusters, dtype=torch.float32) - if images[0].is_cuda: - # if image is stored on a CUDA GPA, convert tensors to CUDA - cluster_tensors = cluster_tensors.cuda() + if len(images) > 0 and images[0].is_cuda: + # if image is stored on a CUDA GPA, convert tensors to CUDA + cluster_tensors = cluster_tensors.cuda() # 2. Group images into batches of the same shape for more efficient processing. grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping) - unordered_processed_images = {} - + resized_images_grouped = {} # Loop through shapes and stacked images for shape, stacked_images in grouped_images.items(): # Resize to specific sizes (if do_resize is specified) @@ -157,16 +166,33 @@ def _preprocess( size=size, interpolation=interpolation ) - # Normalize pixel values (if do_normalize is specified) if do_normalize: - stacked_images = stacked_images.float() - stacked_images = (stacked_images / 127.5) - 1.0 - - unordered_processed_images[shape] = stacked_images + stacked_images = self.normalize( + image=stacked_images, + mean=image_mean, + std=image_std + ) + resized_images_grouped[shape] = stacked_images # 3. 
Reorder and maintain original image order after processing into batches - processed_images = reorder_images(unordered_processed_images, grouped_images_index) + resized_images = reorder_images(resized_images_grouped, grouped_images_index) + + # Group images by size for further processing + # Needed in case do_resize is False, or resize returns images with different sizes + grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping) + processed_images_grouped = {} + for shape, stacked_images in grouped_images.items(): + if do_center_crop: + stacked_images = self.center_crop(stacked_images, crop_size) + # Fused rescale and normalize + stacked_images = self.rescale_and_normalize( + stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std + ) + processed_images_grouped[shape] = stacked_images + + processed_images = reorder_images(processed_images_grouped, grouped_images_index) + processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images # 4. 
Color quantize if specified if do_color_quantize: @@ -180,7 +206,7 @@ def _preprocess( image_hwc = torch.clamp(image_hwc, 0, 255) # Fast torch-based color quantization - quantized = self._color_quantize_torch(image_hwc, cluster_tensors) + quantized = color_quantize_fast(image_hwc, cluster_tensors) # Flatten to sequence (H*W,) quantized_flat = quantized.view(-1) From ec1681bca5100d18f117962d70bb99325e27b346 Mon Sep 17 00:00:00 2001 From: Ethan Ayaay Date: Thu, 7 Aug 2025 11:41:35 -0700 Subject: [PATCH 13/33] modified imageGPT test file to properly run fast processor tests --- .../models/imagegpt/test_image_processing_imagegpt.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tests/models/imagegpt/test_image_processing_imagegpt.py b/tests/models/imagegpt/test_image_processing_imagegpt.py index db26154a562c..154440c2f134 100644 --- a/tests/models/imagegpt/test_image_processing_imagegpt.py +++ b/tests/models/imagegpt/test_image_processing_imagegpt.py @@ -108,11 +108,12 @@ def image_processor_dict(self): return self.image_processor_tester.prepare_image_processor_dict() def test_image_processor_properties(self): - image_processing = self.image_processing_class(**self.image_processor_dict) - self.assertTrue(hasattr(image_processing, "clusters")) - self.assertTrue(hasattr(image_processing, "do_resize")) - self.assertTrue(hasattr(image_processing, "size")) - self.assertTrue(hasattr(image_processing, "do_normalize")) + for image_processing_class in self.image_processors_list: + image_processing = image_processing_class(**self.image_processor_dict) + self.assertTrue(hasattr(image_processing, "clusters")) + self.assertTrue(hasattr(image_processing, "do_resize")) + self.assertTrue(hasattr(image_processing, "size")) + self.assertTrue(hasattr(image_processing, "do_normalize")) def test_image_processor_from_dict_with_kwargs(self): image_processor = self.image_processing_class.from_dict(self.image_processor_dict) From 
432e8f3ba7804867af5f00460504f09abda10a05 Mon Sep 17 00:00:00 2001
From: Ethan Ayaay
Date: Thu, 7 Aug 2025 12:19:53 -0700
Subject: [PATCH 14/33] converts images to torch.float32 from torch.uint8

---
 .../models/imagegpt/image_processing_imagegpt_fast.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py
index abd548a5a2ab..78819fe2fa12 100644
--- a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py
+++ b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py
@@ -125,7 +125,7 @@ def _preprocess(
         image_std: Optional[Union[float, list[float]]] = None,
         disable_grouping: Optional[bool] = False,
         return_tensors: Optional[Union[str, TensorType]] = None,
-        resample: Optional[PILImageResampling] = None,
+        resample: Optional[PILImageResampling] = PILImageResampling.BILINEAR,
         do_color_quantize: Optional[bool] = None,
         clusters: Optional[Union[list[list[int]], np.ndarray]] = None,
         data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.FIRST,
@@ -146,6 +146,9 @@ def _preprocess(
         if do_color_quantize and clusters is None:
             raise ValueError("Clusters must be specified if do_color_quantize is True.")
 
+        # Convert images to torch float32
+        images = [image.to(torch.float32) for image in images]
+
         # Clusters come in np arrays. Convert to torch tensors.
cluster_tensors = None if clusters is not None: From 040678c87153be28c278bf2c767c5519bc8b0568 Mon Sep 17 00:00:00 2001 From: Ethan Ayaay Date: Thu, 7 Aug 2025 12:34:46 -0700 Subject: [PATCH 15/33] fixed a typo with self.image_processor_list in the imagegpt test file --- tests/models/imagegpt/test_image_processing_imagegpt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/imagegpt/test_image_processing_imagegpt.py b/tests/models/imagegpt/test_image_processing_imagegpt.py index 154440c2f134..0936524cfa05 100644 --- a/tests/models/imagegpt/test_image_processing_imagegpt.py +++ b/tests/models/imagegpt/test_image_processing_imagegpt.py @@ -108,7 +108,7 @@ def image_processor_dict(self): return self.image_processor_tester.prepare_image_processor_dict() def test_image_processor_properties(self): - for image_processing_class in self.image_processors_list: + for image_processing_class in self.image_processor_list: image_processing = image_processing_class(**self.image_processor_dict) self.assertTrue(hasattr(image_processing, "clusters")) self.assertTrue(hasattr(image_processing, "do_resize")) From 6e0c6703a3094a0487f5abfbfd846fd9b4f0eeb5 Mon Sep 17 00:00:00 2001 From: Ethan Ayaay Date: Thu, 7 Aug 2025 12:38:35 -0700 Subject: [PATCH 16/33] updated more instances of image_processing = self.image_processing_class in the test file to test fast processor --- .../test_image_processing_imagegpt.py | 109 +++++++++--------- 1 file changed, 56 insertions(+), 53 deletions(-) diff --git a/tests/models/imagegpt/test_image_processing_imagegpt.py b/tests/models/imagegpt/test_image_processing_imagegpt.py index 0936524cfa05..56eaf32895e8 100644 --- a/tests/models/imagegpt/test_image_processing_imagegpt.py +++ b/tests/models/imagegpt/test_image_processing_imagegpt.py @@ -186,43 +186,45 @@ def test_init_without_params(self): # Override the test from ImageProcessingTestMixin as ImageGPT model takes input_ids as input def test_call_pil(self): - # Initialize 
image_processing - image_processing = self.image_processing_class(**self.image_processor_dict) - # create random PIL images - image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False) - for image in image_inputs: - self.assertIsInstance(image, Image.Image) - - # Test not batched input - encoded_images = image_processing(image_inputs[0], return_tensors="pt").input_ids - expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(encoded_images) - self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape)) - - # Test batched - encoded_images = image_processing(image_inputs, return_tensors="pt").input_ids - self.assertEqual( - tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape) - ) + for image_processing_class in self.image_processor_list: + # Initialize image_processing + image_processing = image_processing_class(**self.image_processor_dict) + # create random PIL images + image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False) + for image in image_inputs: + self.assertIsInstance(image, Image.Image) + + # Test not batched input + encoded_images = image_processing(image_inputs[0], return_tensors="pt").input_ids + expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(encoded_images) + self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape)) + + # Test batched + encoded_images = image_processing(image_inputs, return_tensors="pt").input_ids + self.assertEqual( + tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape) + ) # Override the test from ImageProcessingTestMixin as ImageGPT model takes input_ids as input def test_call_numpy(self): - # Initialize image_processing - image_processing = self.image_processing_class(**self.image_processor_dict) - # create random numpy tensors - image_inputs = 
self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True) - for image in image_inputs: - self.assertIsInstance(image, np.ndarray) - - # Test not batched input - encoded_images = image_processing(image_inputs[0], return_tensors="pt").input_ids - expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(encoded_images) - self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape)) - - # Test batched - encoded_images = image_processing(image_inputs, return_tensors="pt").input_ids - self.assertEqual( - tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape) - ) + for image_processing_class in self.image_processor_list: + # Initialize image_processing + image_processing = image_processing_class(**self.image_processor_dict) + # create random numpy tensors + image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True) + for image in image_inputs: + self.assertIsInstance(image, np.ndarray) + + # Test not batched input + encoded_images = image_processing(image_inputs[0], return_tensors="pt").input_ids + expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(encoded_images) + self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape)) + + # Test batched + encoded_images = image_processing(image_inputs, return_tensors="pt").input_ids + self.assertEqual( + tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape) + ) @unittest.skip(reason="ImageGPT assumes clusters for 3 channels") def test_call_numpy_4_channels(self): @@ -230,25 +232,26 @@ def test_call_numpy_4_channels(self): # Override the test from ImageProcessingTestMixin as ImageGPT model takes input_ids as input def test_call_pytorch(self): - # Initialize image_processing - image_processing = self.image_processing_class(**self.image_processor_dict) - # create random 
PyTorch tensors - image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True) - expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs) - - for image in image_inputs: - self.assertIsInstance(image, torch.Tensor) - - # Test not batched input - encoded_images = image_processing(image_inputs[0], return_tensors="pt").input_ids - self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape)) - - # Test batched - encoded_images = image_processing(image_inputs, return_tensors="pt").input_ids - self.assertEqual( - tuple(encoded_images.shape), - (self.image_processor_tester.batch_size, *expected_output_image_shape), - ) + for image_processing_class in self.image_processor_list: + # Initialize image_processing + image_processing = image_processing_class(**self.image_processor_dict) + # create random PyTorch tensors + image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True) + expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs) + + for image in image_inputs: + self.assertIsInstance(image, torch.Tensor) + + # Test not batched input + encoded_images = image_processing(image_inputs[0], return_tensors="pt").input_ids + self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape)) + + # Test batched + encoded_images = image_processing(image_inputs, return_tensors="pt").input_ids + self.assertEqual( + tuple(encoded_images.shape), + (self.image_processor_tester.batch_size, *expected_output_image_shape), + ) def prepare_images(): From a020d5fc6a462f889029b03246adabd093d11848 Mon Sep 17 00:00:00 2001 From: Ethan Ayaay Date: Mon, 11 Aug 2025 10:46:30 -0700 Subject: [PATCH 17/33] standardized normalization to not use image mean or std --- .../image_processing_imagegpt_fast.py | 35 +++++++------------ 1 file changed, 13 insertions(+), 22 deletions(-) diff --git 
a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py index 78819fe2fa12..41a74ecbde98 100644 --- a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py +++ b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py @@ -59,12 +59,10 @@ def color_quantize_fast(x, clusters): class ImageGPTFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): # TODO: Add documentation for each argument - do_color_quantize: Optional[bool] = True clusters: Optional[np.ndarray] = None + do_color_quantize: Optional[bool] = True resample: Optional[PILImageResampling] = PILImageResampling.BILINEAR return_tensors: Optional[Union[str, TensorType]] = None, - data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.FIRST, - input_data_format: Optional[Union[str, ChannelDimension]] = None @auto_docstring class ImageGPTImageProcessorFast(BaseImageProcessorFast): @@ -81,21 +79,18 @@ class ImageGPTImageProcessorFast(BaseImageProcessorFast): resample = PILImageResampling.BILINEAR size = {"height": 256, "width": 256} # import get_size_dict?, can be overridden in preprocess do_resize = True - do_normalize = True - - # Specific Kwargs + do_normalize = False do_color_quantize = True clusters = None - resample = PILImageResampling.BILINEAR # not in base ########## image_mean = [0.5, 0.5, 0.5] # not in base, normalize uses a constant factor to divide pixel values image_std = [0.5, 0.5, 0.5] # not in base, normalize uses a constant factor to divide pixel values - default_to_square = None # not in base - crop_size = None # not in base - do_center_crop = None # not in base - do_rescale = None # not in base - do_convert_rgb = None # not in base + # default_to_square = None # not in base + # crop_size = None # not in base + # do_center_crop = None # not in base + # do_rescale = None # not in base + # do_convert_rgb = None # not in base ############ # initialize these arguments, pass it 
into super constructor @@ -169,13 +164,6 @@ def _preprocess( size=size, interpolation=interpolation ) - # Normalize pixel values (if do_normalize is specified) - if do_normalize: - stacked_images = self.normalize( - image=stacked_images, - mean=image_mean, - std=image_std - ) resized_images_grouped[shape] = stacked_images # 3. Reorder and maintain original image order after processing into batches @@ -188,10 +176,13 @@ def _preprocess( for shape, stacked_images in grouped_images.items(): if do_center_crop: stacked_images = self.center_crop(stacked_images, crop_size) - # Fused rescale and normalize - stacked_images = self.rescale_and_normalize( - stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std + # Fused rescale + stacked_images = self.rescale( + stacked_images, rescale_factor ) + if do_normalize: + stacked_images = (stacked_images / 127.5) - 1.0 + processed_images_grouped[shape] = stacked_images processed_images = reorder_images(processed_images_grouped, grouped_images_index) From 56b3546e2882e09253aac9795ba0a836752ce770 Mon Sep 17 00:00:00 2001 From: Ethan Ayaay Date: Mon, 11 Aug 2025 13:30:48 -0700 Subject: [PATCH 18/33] Merged changes from solution2 branch --- .../image_processing_imagegpt_fast.py | 320 ++++++++---------- 1 file changed, 143 insertions(+), 177 deletions(-) diff --git a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py index 41a74ecbde98..8d62789121f7 100644 --- a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py +++ b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py @@ -13,206 +13,172 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Fast Image processor class for ImageGPT.""" -import PIL + import numpy as np -from typing import Dict, List, Optional, Tuple, Union - -from transformers.image_transforms import to_channel_dimension_format - -from ...image_processing_utils_fast import ( - BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, - BatchFeature, logger, - group_images_by_shape, reorder_images -) -from ...processing_utils import Unpack -from ...image_utils import ( - PILImageResampling, - ImageInput, - ChannelDimension, SizeDict, - infer_channel_dimension_format, make_list_of_images, valid_images, validate_preprocess_arguments, is_scaled_image, -) -from ...utils import ( - auto_docstring, - is_torch_available, - is_torchvision_available, - TensorType -) - -if is_torch_available(): - import torch - if is_torchvision_available(): - from torchvision.transforms import functional as F - -def squared_euclidean_distance_fast(a, b): - b = b.T - a2 = torch.sum(a ** 2, dim = 1) - b2 = torch.sum(b ** 2, dim = 0) - ab = torch.matmul(a, b) +import torch +from typing import Optional, Union + +from ...image_processing_utils_fast import BaseImageProcessorFast +from ...image_utils import PILImageResampling +from ...utils import auto_docstring + + +def squared_euclidean_distance_torch(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor: + """ + Compute squared Euclidean distances between all pixels and clusters. + + Args: + a: (N, 3) tensor of pixel RGB values + b: (M, 3) tensor of cluster RGB values + + Returns: + (N, M) tensor of squared distances + """ + b = b.t() # (3, M) + a2 = torch.sum(a ** 2, dim=1) # (N,) + b2 = torch.sum(b ** 2, dim=0) # (M,) + ab = torch.matmul(a, b) # (N, M) d = a2[:, None] - 2 * ab + b2[None, :] return d -def color_quantize_fast(x, clusters): - x = x.reshape(-1, 3) - d = squared_euclidean_distance_fast(x, clusters) + +def color_quantize_torch(x: torch.Tensor, clusters: torch.Tensor) -> torch.Tensor: + """ + Assign each pixel to its nearest color cluster. 
+ + Args: + x: (H*W, 3) tensor of flattened pixel RGB values + clusters: (n_clusters, 3) tensor of cluster RGB values + + Returns: + (H*W,) tensor of cluster indices + """ + d = squared_euclidean_distance_torch(x, clusters) return torch.argmin(d, dim=1) -class ImageGPTFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - # TODO: Add documentation for each argument - clusters: Optional[np.ndarray] = None - do_color_quantize: Optional[bool] = True - resample: Optional[PILImageResampling] = PILImageResampling.BILINEAR - return_tensors: Optional[Union[str, TensorType]] = None, @auto_docstring class ImageGPTImageProcessorFast(BaseImageProcessorFast): - # This generated class can be used as a starting point for the fast image processor. - # if the image processor is only used for simple augmentations, such as resizing, center cropping, rescaling, or normalizing, - # only the default values should be set in the class. - # If the image processor requires more complex augmentations, methods from BaseImageProcessorFast can be overridden. - # In most cases, only the `_preprocess` method should be overridden. + """ + Constructs a fast ImageGPT image processor. - # For an example of a fast image processor requiring more complex augmentations, see `LlavaNextImageProcessorFast`. + This processor can be used to resize images to a smaller resolution (such as 32x32 or 64x64), + normalize them and finally color quantize them to obtain sequences of "pixel values" (color clusters). 
+ """ - # Default values should be checked against the slow image processor - # None values left after checking can be removed + model_input_names = ["input_ids"] + + # Defaults largely aligned with the slow processor, except normalization which we do manually to [-1, 1] resample = PILImageResampling.BILINEAR - size = {"height": 256, "width": 256} # import get_size_dict?, can be overridden in preprocess + size = {"height": 256, "width": 256} do_resize = True + # We do NOT use the base normalization/rescale as ImageGPT expects (x/127.5 - 1) + do_rescale = False do_normalize = False - do_color_quantize = True - clusters = None - - # not in base ########## - image_mean = [0.5, 0.5, 0.5] # not in base, normalize uses a constant factor to divide pixel values - image_std = [0.5, 0.5, 0.5] # not in base, normalize uses a constant factor to divide pixel values - # default_to_square = None # not in base - # crop_size = None # not in base - # do_center_crop = None # not in base - # do_rescale = None # not in base - # do_convert_rgb = None # not in base - ############ - - # initialize these arguments, pass it into super constructor - def __init__(self, **kwargs: Unpack[ImageGPTFastImageProcessorKwargs]): - super().__init__(**kwargs) - # _preprocessor has additional kwargs: - # images, return_tensors, data_format, input_data_format + do_color_quantize = True + clusters = None # Must be set at instantiation - # PUBLIC preprocess: - def preprocess(self, images: ImageInput, **kwargs: Unpack[ImageGPTFastImageProcessorKwargs]) -> BatchFeature: - return super().preprocess(images, **kwargs) + def __init__( + self, + clusters: Optional[Union[list, np.ndarray]] = None, + **kwargs, + ): + super().__init__(**kwargs) + # Store clusters as numpy for JSON serializability. Convert to torch in _preprocess when needed. + if clusters is not None: + self.clusters = np.asarray(clusters, dtype=np.float32) + else: + self.clusters = None + # Default: follow ImageGPT behavior (normalize by default). 
We stash here and force base to skip. + self._do_normalize_imagegpt = kwargs.get("do_normalize", True) + + def _further_process_kwargs(self, **kwargs): + # Let the base process size/crop and other standard kwargs first + kwargs = super()._further_process_kwargs(**kwargs) + if "do_normalize" in kwargs and kwargs["do_normalize"] is not None: + self._do_normalize_imagegpt = kwargs["do_normalize"] + # Force base pipeline to skip its rescale/normalize validation and logic + kwargs["do_rescale"] = False + kwargs["do_normalize"] = False + return kwargs - # PRIVATE preprocess: def _preprocess( self, - images: list["torch.Tensor"], - do_resize: bool, - size: SizeDict, - interpolation: Optional["F.InterpolationMode"], - do_center_crop: bool, - crop_size: SizeDict, - do_rescale: bool, - rescale_factor: float, - do_normalize: bool, - image_mean: Optional[Union[float, list[float]]] = None, - image_std: Optional[Union[float, list[float]]] = None, - disable_grouping: Optional[bool] = False, - return_tensors: Optional[Union[str, TensorType]] = None, - resample: Optional[PILImageResampling] = PILImageResampling.BILINEAR, + images, do_color_quantize: Optional[bool] = None, - clusters: Optional[Union[list[list[int]], np.ndarray]] = None, - data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.FIRST, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs, - ) -> BatchFeature: - do_resize = do_resize if do_resize is not None else self.do_resize - size = size if size is not None else self.size - resample = resample if resample is not None else self.resample - do_normalize = do_normalize if do_normalize is not None else self.do_normalize - do_color_quantize = do_color_quantize if do_color_quantize is not None else self.do_color_quantize - clusters = clusters if clusters is not None else self.clusters - image_mean = image_mean if image_mean is not None else self.image_mean - image_std = image_std if image_std is not None else self.image_std - - # 
1. Setup. Validate ImageGPT-specific requirements - # Check for do_color_quantize and clusters. - if do_color_quantize and clusters is None: - raise ValueError("Clusters must be specified if do_color_quantize is True.") - - # Convert images to torch float32 - images = [image.to(torch.float32) for image in images] - - # Clusters come in np arrays. Convert to torch tensors. - cluster_tensors = None - if clusters is not None: - cluster_tensors = torch.tensor(clusters, dtype=torch.float32) - if len(images) > 0 and images[0].is_cuda: - # if image is stored on a CUDA GPA, convert tensors to CUDA - cluster_tensors = cluster_tensors.cuda() - - # 2. Group images into batches of the same shape for more efficient processing. - grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping) - resized_images_grouped = {} - # Loop through shapes and stacked images - for shape, stacked_images in grouped_images.items(): - # Resize to specific sizes (if do_resize is specified) - if do_resize: - stacked_images = self.resize( - image=stacked_images, - size=size, - interpolation=interpolation - ) - resized_images_grouped[shape] = stacked_images - - # 3. 
Reorder and maintain original image order after processing into batches - resized_images = reorder_images(resized_images_grouped, grouped_images_index) - - # Group images by size for further processing - # Needed in case do_resize is False, or resize returns images with different sizes - grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping) - processed_images_grouped = {} - for shape, stacked_images in grouped_images.items(): - if do_center_crop: - stacked_images = self.center_crop(stacked_images, crop_size) - # Fused rescale - stacked_images = self.rescale( - stacked_images, rescale_factor - ) + clusters: Optional[Union[list, np.ndarray, torch.Tensor]] = None, + return_tensors: Optional[str] = None, + **kwargs + ): + # Run standard fast pipeline (resize, crop, batching) without rescale/normalize + base_batch = super()._preprocess(images, return_tensors=return_tensors, **kwargs) + pixel_values = base_batch["pixel_values"] # Tensor [B,C,H,W] or list of [C,H,W] + + # Apply ImageGPT normalization when requested: [-1, 1] + do_normalize = getattr(self, "_do_normalize_imagegpt", True) + if isinstance(pixel_values, torch.Tensor): + normalized = pixel_values.to(dtype=torch.float32) if do_normalize: - stacked_images = (stacked_images / 127.5) - 1.0 - - processed_images_grouped[shape] = stacked_images - - processed_images = reorder_images(processed_images_grouped, grouped_images_index) - processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images + normalized = normalized / 127.5 - 1.0 + else: + normalized = [img.to(dtype=torch.float32) for img in pixel_values] + if do_normalize: + normalized = [img / 127.5 - 1.0 for img in normalized] - # 4. 
Color quantize if specified + # If color quantization is requested, perform it; otherwise return pixel values + do_color_quantize = do_color_quantize if do_color_quantize is not None else self.do_color_quantize if do_color_quantize: - quantized_images = [] - for image in processed_images: - # Convert CHW to HWC for quantization - image_hwc = image.permute(1, 2, 0) # (H, W, C) - - # Denormalize back to [0, 255] for quantization - image_hwc = (image_hwc + 1.0) * 127.5 - image_hwc = torch.clamp(image_hwc, 0, 255) + # Prepare clusters + clusters = clusters if clusters is not None else self.clusters + if clusters is None: + raise ValueError("Clusters must be provided for color quantization.") + clusters_torch = torch.as_tensor(clusters, dtype=torch.float32) + + # Helper for clarity: quantize a single image [C,H,W] -> [H*W] + def _quantize_one_image(image_chw: torch.Tensor, clusters_ref: torch.Tensor) -> torch.Tensor: + device_clusters = clusters_ref.to(image_chw.device, dtype=image_chw.dtype) + img_hwc = image_chw.permute(1, 2, 0) + pixels = img_hwc.reshape(-1, 3) + return color_quantize_torch(pixels, device_clusters) + + if isinstance(normalized, torch.Tensor): + images_list = [img for img in normalized] + else: + images_list = list(normalized) + + ids_list = [_quantize_one_image(img, clusters_torch) for img in images_list] + + if return_tensors == "pt": + input_ids = torch.stack(ids_list, dim=0) + pixel_values_out = torch.stack(images_list, dim=0) + else: + input_ids = ids_list + pixel_values_out = images_list + + from ...image_processing_utils import BatchFeature + return BatchFeature(data={"input_ids": input_ids, "pixel_values": pixel_values_out}, tensor_type=return_tensors) + + # Otherwise, return pixel values (normalized or not depending on flag) + base_batch["pixel_values"] = normalized + return base_batch + + def to_dict(self): + # Convert numpy arrays to lists for JSON serialization + output = super().to_dict() + if output.get("clusters") is not None and 
isinstance(output["clusters"], np.ndarray): + output["clusters"] = output["clusters"].tolist() + # ImageGPT does not use base mean/std normalization; keep these None for parity with slow processor + # output["image_mean"] = None + # output["image_std"] = None + # No rescaling in fast ImageGPT path + + #Need to set these valus to match with slow processor during testing + output["rescale_factor"] = None + output["do_rescale"] = None + output["do_color_quantize"] = bool(getattr(self, "do_color_quantize", True)) + output.pop("_do_normalize_imagegpt", None) + return output - # Fast torch-based color quantization - quantized = color_quantize_fast(image_hwc, cluster_tensors) - - # Flatten to sequence (H*W,) - quantized_flat = quantized.view(-1) - quantized_images.append(quantized_flat) - - # Stack all quantized sequences - input_ids = torch.stack(quantized_images, dim=0) - - return BatchFeature(data={"input_ids": input_ids}, tensor_type=return_tensors) - else: - # 5. Standard output without quantizing - pixel_values = torch.stack(processed_images, dim=0) - return BatchFeature(data={"pixel_values": pixel_values}, tensor_type=return_tensors) __all__ = ["ImageGPTImageProcessorFast"] From 0b21bff9e878b3e0a5a9a7c0721c26f0aa295c70 Mon Sep 17 00:00:00 2001 From: Ethan Ayaay Date: Mon, 11 Aug 2025 13:31:39 -0700 Subject: [PATCH 19/33] Merged changes from solution2 test file --- .../test_image_processing_imagegpt.py | 51 ++++++++++--------- 1 file changed, 27 insertions(+), 24 deletions(-) diff --git a/tests/models/imagegpt/test_image_processing_imagegpt.py b/tests/models/imagegpt/test_image_processing_imagegpt.py index 56eaf32895e8..12a83b25c1d8 100644 --- a/tests/models/imagegpt/test_image_processing_imagegpt.py +++ b/tests/models/imagegpt/test_image_processing_imagegpt.py @@ -116,43 +116,46 @@ def test_image_processor_properties(self): self.assertTrue(hasattr(image_processing, "do_normalize")) def test_image_processor_from_dict_with_kwargs(self): - image_processor = 
self.image_processing_class.from_dict(self.image_processor_dict) - self.assertEqual(image_processor.size, {"height": 18, "width": 18}) + for image_processing_class in self.image_processor_list: + image_processor = image_processing_class.from_dict(self.image_processor_dict) + self.assertEqual(image_processor.size, {"height": 18, "width": 18}) - image_processor = self.image_processing_class.from_dict(self.image_processor_dict, size=42) - self.assertEqual(image_processor.size, {"height": 42, "width": 42}) + image_processor = image_processing_class.from_dict(self.image_processor_dict, size=42) + self.assertEqual(image_processor.size, {"height": 42, "width": 42}) def test_image_processor_to_json_string(self): - image_processor = self.image_processing_class(**self.image_processor_dict) - obj = json.loads(image_processor.to_json_string()) - for key, value in self.image_processor_dict.items(): - if key == "clusters": - self.assertTrue(np.array_equal(value, obj[key])) - else: - self.assertEqual(obj[key], value) + for image_processing_class in self.image_processor_list: + image_processor = image_processing_class(**self.image_processor_dict) + obj = json.loads(image_processor.to_json_string()) + for key, value in self.image_processor_dict.items(): + if key == "clusters": + self.assertTrue(np.array_equal(value, obj[key])) + else: + self.assertEqual(obj[key], value) def test_image_processor_to_json_file(self): - image_processor_first = self.image_processing_class(**self.image_processor_dict) + for image_processing_class in self.image_processor_list: + image_processor_first = image_processing_class(**self.image_processor_dict) - with tempfile.TemporaryDirectory() as tmpdirname: - json_file_path = os.path.join(tmpdirname, "image_processor.json") - image_processor_first.to_json_file(json_file_path) - image_processor_second = self.image_processing_class.from_json_file(json_file_path).to_dict() + with tempfile.TemporaryDirectory() as tmpdirname: + json_file_path = 
os.path.join(tmpdirname, "image_processor.json") + image_processor_first.to_json_file(json_file_path) + image_processor_second = image_processing_class.from_json_file(json_file_path).to_dict() - image_processor_first = image_processor_first.to_dict() - for key, value in image_processor_first.items(): - if key == "clusters": - self.assertTrue(np.array_equal(value, image_processor_second[key])) - else: - self.assertEqual(image_processor_first[key], value) + image_processor_first = image_processor_first.to_dict() + for key, value in image_processor_first.items(): + if key == "clusters": + self.assertTrue(np.array_equal(value, image_processor_second[key])) + else: + self.assertEqual(image_processor_first[key], value) def test_image_processor_from_and_save_pretrained(self): for image_processing_class in self.image_processor_list: - image_processor_first = self.image_processing_class(**self.image_processor_dict) + image_processor_first = image_processing_class(**self.image_processor_dict) with tempfile.TemporaryDirectory() as tmpdirname: image_processor_first.save_pretrained(tmpdirname) - image_processor_second = self.image_processing_class.from_pretrained(tmpdirname).to_dict() + image_processor_second = image_processing_class.from_pretrained(tmpdirname).to_dict() image_processor_first = image_processor_first.to_dict() for key, value in image_processor_first.items(): From e98d5faab85171af4f49f3082d36fadb165a2733 Mon Sep 17 00:00:00 2001 From: agamjots Date: Mon, 11 Aug 2025 14:05:40 -0700 Subject: [PATCH 20/33] fixed testing through baseImageGPT processor file --- .../imagegpt/image_processing_imagegpt.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/imagegpt/image_processing_imagegpt.py b/src/transformers/models/imagegpt/image_processing_imagegpt.py index 5d3e207dd3eb..a264b948848a 100644 --- a/src/transformers/models/imagegpt/image_processing_imagegpt.py +++ 
b/src/transformers/models/imagegpt/image_processing_imagegpt.py @@ -279,6 +279,12 @@ def preprocess( if do_normalize: images = [self.normalize(image=image, input_data_format=input_data_format) for image in images] + # Need pixel_values (normalized, channels_first) for equivalence tests + pixel_values = [ + to_channel_dimension_format(image, ChannelDimension.FIRST, input_channel_dim=input_data_format) + for image in images + ] + if do_color_quantize: images = [to_channel_dimension_format(image, ChannelDimension.LAST, input_data_format) for image in images] # color quantize from (batch_size, height, width, 3) to (batch_size, height, width) @@ -291,14 +297,21 @@ def preprocess( # We need to convert back to a list of images to keep consistent behaviour across processors. images = list(images) + data = {"input_ids": images, "pixel_values": pixel_values} else: images = [ to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images ] - - data = {"input_ids": images} + data = {"pixel_values": images} return BatchFeature(data=data, tensor_type=return_tensors) + def to_dict(self): + output = super().to_dict() + # Ensure clusters are JSON/equality friendly + if output.get("clusters") is not None and isinstance(output["clusters"], np.ndarray): + output["clusters"] = output["clusters"].tolist() + return output + __all__ = ["ImageGPTImageProcessor"] From f3b0a8c80d6e4534fa9418ad1cfaf19cfdf18525 Mon Sep 17 00:00:00 2001 From: Ethan Ayaay Date: Tue, 12 Aug 2025 11:02:59 -0700 Subject: [PATCH 21/33] Fixed check_code_quality test. Removed unncessary list comprehension. 
--- .../models/imagegpt/image_processing_imagegpt_fast.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py index 8d62789121f7..cff81df60c77 100644 --- a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py +++ b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py @@ -142,10 +142,7 @@ def _quantize_one_image(image_chw: torch.Tensor, clusters_ref: torch.Tensor) -> pixels = img_hwc.reshape(-1, 3) return color_quantize_torch(pixels, device_clusters) - if isinstance(normalized, torch.Tensor): - images_list = [img for img in normalized] - else: - images_list = list(normalized) + images_list = list(normalized) ids_list = [_quantize_one_image(img, clusters_torch) for img in images_list] From 43c61716443056256d67e196d0d66c2cf7aad22b Mon Sep 17 00:00:00 2001 From: Ethan Ayaay Date: Tue, 12 Aug 2025 11:08:30 -0700 Subject: [PATCH 22/33] reorganized imports in image_processing_imagegpt_fast --- .../models/imagegpt/image_processing_imagegpt_fast.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py index cff81df60c77..c54a89892483 100644 --- a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py +++ b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py @@ -14,9 +14,10 @@ # limitations under the License. 
"""Fast Image processor class for ImageGPT.""" +from typing import Optional, Union + import numpy as np import torch -from typing import Optional, Union from ...image_processing_utils_fast import BaseImageProcessorFast from ...image_utils import PILImageResampling From 5bb6d5aed161ec158ca7f27df88efb36fbdb86f9 Mon Sep 17 00:00:00 2001 From: Ethan Ayaay Date: Tue, 12 Aug 2025 12:12:38 -0700 Subject: [PATCH 23/33] formatted image_processing_imagegpt_fast.py --- .../image_processing_imagegpt_fast.py | 38 +++++++++++++------ 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py index c54a89892483..c34f9e150ab2 100644 --- a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py +++ b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py @@ -36,9 +36,9 @@ def squared_euclidean_distance_torch(a: torch.Tensor, b: torch.Tensor) -> torch. 
(N, M) tensor of squared distances """ b = b.t() # (3, M) - a2 = torch.sum(a ** 2, dim=1) # (N,) - b2 = torch.sum(b ** 2, dim=0) # (M,) - ab = torch.matmul(a, b) # (N, M) + a2 = torch.sum(a**2, dim=1) # (N,) + b2 = torch.sum(b**2, dim=0) # (M,) + ab = torch.matmul(a, b) # (N, M) d = a2[:, None] - 2 * ab + b2[None, :] return d @@ -110,10 +110,12 @@ def _preprocess( do_color_quantize: Optional[bool] = None, clusters: Optional[Union[list, np.ndarray, torch.Tensor]] = None, return_tensors: Optional[str] = None, - **kwargs + **kwargs, ): # Run standard fast pipeline (resize, crop, batching) without rescale/normalize - base_batch = super()._preprocess(images, return_tensors=return_tensors, **kwargs) + base_batch = super()._preprocess( + images, return_tensors=return_tensors, **kwargs + ) pixel_values = base_batch["pixel_values"] # Tensor [B,C,H,W] or list of [C,H,W] # Apply ImageGPT normalization when requested: [-1, 1] @@ -128,7 +130,11 @@ def _preprocess( normalized = [img / 127.5 - 1.0 for img in normalized] # If color quantization is requested, perform it; otherwise return pixel values - do_color_quantize = do_color_quantize if do_color_quantize is not None else self.do_color_quantize + do_color_quantize = ( + do_color_quantize + if do_color_quantize is not None + else self.do_color_quantize + ) if do_color_quantize: # Prepare clusters clusters = clusters if clusters is not None else self.clusters @@ -137,8 +143,12 @@ def _preprocess( clusters_torch = torch.as_tensor(clusters, dtype=torch.float32) # Helper for clarity: quantize a single image [C,H,W] -> [H*W] - def _quantize_one_image(image_chw: torch.Tensor, clusters_ref: torch.Tensor) -> torch.Tensor: - device_clusters = clusters_ref.to(image_chw.device, dtype=image_chw.dtype) + def _quantize_one_image( + image_chw: torch.Tensor, clusters_ref: torch.Tensor + ) -> torch.Tensor: + device_clusters = clusters_ref.to( + image_chw.device, dtype=image_chw.dtype + ) img_hwc = image_chw.permute(1, 2, 0) pixels = 
img_hwc.reshape(-1, 3) return color_quantize_torch(pixels, device_clusters) @@ -155,7 +165,11 @@ def _quantize_one_image(image_chw: torch.Tensor, clusters_ref: torch.Tensor) -> pixel_values_out = images_list from ...image_processing_utils import BatchFeature - return BatchFeature(data={"input_ids": input_ids, "pixel_values": pixel_values_out}, tensor_type=return_tensors) + + return BatchFeature( + data={"input_ids": input_ids, "pixel_values": pixel_values_out}, + tensor_type=return_tensors, + ) # Otherwise, return pixel values (normalized or not depending on flag) base_batch["pixel_values"] = normalized @@ -164,14 +178,16 @@ def _quantize_one_image(image_chw: torch.Tensor, clusters_ref: torch.Tensor) -> def to_dict(self): # Convert numpy arrays to lists for JSON serialization output = super().to_dict() - if output.get("clusters") is not None and isinstance(output["clusters"], np.ndarray): + if output.get("clusters") is not None and isinstance( + output["clusters"], np.ndarray + ): output["clusters"] = output["clusters"].tolist() # ImageGPT does not use base mean/std normalization; keep these None for parity with slow processor # output["image_mean"] = None # output["image_std"] = None # No rescaling in fast ImageGPT path - #Need to set these valus to match with slow processor during testing + # Need to set these valus to match with slow processor during testing output["rescale_factor"] = None output["do_rescale"] = None output["do_color_quantize"] = bool(getattr(self, "do_color_quantize", True)) From e575ba74bd625856ca8c22e62ce246250189e373 Mon Sep 17 00:00:00 2001 From: chris Date: Tue, 12 Aug 2025 12:25:43 -0700 Subject: [PATCH 24/33] Added arg documentation --- .../imagegpt/image_processing_imagegpt_fast.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py index c34f9e150ab2..add2527cf1fc 100644 --- 
a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py +++ b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py @@ -112,6 +112,20 @@ def _preprocess( return_tensors: Optional[str] = None, **kwargs, ): + """ + Preprocess an image or batch of images. + + Args: + clusters (`np.ndarray`, `list[list[float]]`, or `torch.Tensor`, *optional*, defaults to `self.clusters`): + Clusters used to quantize the image of shape `(n_clusters, 3)`. Only has an effect if + `do_color_quantize` is set to `True`. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `torch.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + """ # Run standard fast pipeline (resize, crop, batching) without rescale/normalize base_batch = super()._preprocess( images, return_tensors=return_tensors, **kwargs From 25bd8ac477be3362533ba5bd2986d83ab2c7a921 Mon Sep 17 00:00:00 2001 From: chris Date: Tue, 12 Aug 2025 13:36:25 -0700 Subject: [PATCH 25/33] Added FastImageProcessorKwargs class + Docs for new kwargs --- .../image_processing_imagegpt_fast.py | 59 +++++++++++++++++-- 1 file changed, 54 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py index add2527cf1fc..1f907366f055 100644 --- a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py +++ b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py @@ -17,12 +17,25 @@ from typing import Optional, Union import numpy as np -import torch - -from ...image_processing_utils_fast import BaseImageProcessorFast -from ...image_utils import PILImageResampling -from ...utils import auto_docstring +from ...image_processing_utils_fast 
import BaseImageProcessorFast, DefaultFastImageProcessorKwargs +from ...image_utils import PILImageResampling, ChannelDimension +from ...utils import ( + TensorType, + auto_docstring, + is_torch_available, + is_torchvision_available, + is_torchvision_v2_available +) + +if is_torch_available(): + import torch + +if is_torchvision_available(): + if is_torchvision_v2_available(): + from torchvision.transforms.v2 import functional as F + else: + from torchvision.transforms import functional as F def squared_euclidean_distance_torch(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor: """ @@ -57,6 +70,42 @@ def color_quantize_torch(x: torch.Tensor, clusters: torch.Tensor) -> torch.Tenso d = squared_euclidean_distance_torch(x, clusters) return torch.argmin(d, dim=1) +class ImageGPTFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): + """ + clusters (`np.ndarray` or `list[list[int]]`, *optional*): + The color clusters to use, of shape `(n_clusters, 3)` when color quantizing. Can be overridden by `clusters` + in `preprocess`. + resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`): + Resampling filter to use if resizing the image. Can be overridden by `resample` in `preprocess`. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `torch.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the output image. If unset, the channel dimension format of the input + image is used. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. 
+ - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + do_color_quantize (`bool`, *optional*, defaults to `self.do_color_quantize`): + Whether to color quantize the image. + """ + + clusters: Optional[np.ndarray] = None + resample: Optional[PILImageResampling] = PILImageResampling.BILINEAR + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None + do_color_quantize: Optional[bool] = True + @auto_docstring class ImageGPTImageProcessorFast(BaseImageProcessorFast): From ff89353976336eb5d3c67a8650c7417a181d2aa1 Mon Sep 17 00:00:00 2001 From: chris Date: Tue, 12 Aug 2025 13:39:20 -0700 Subject: [PATCH 26/33] Reformatted previous --- .../image_processing_imagegpt_fast.py | 35 ++++++++----------- 1 file changed, 14 insertions(+), 21 deletions(-) diff --git a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py index 1f907366f055..7059a69496f1 100644 --- a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py +++ b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py @@ -18,14 +18,17 @@ import numpy as np -from ...image_processing_utils_fast import BaseImageProcessorFast, DefaultFastImageProcessorKwargs +from ...image_processing_utils_fast import ( + BaseImageProcessorFast, + DefaultFastImageProcessorKwargs, +) from ...image_utils 
import PILImageResampling, ChannelDimension from ...utils import ( TensorType, auto_docstring, is_torch_available, is_torchvision_available, - is_torchvision_v2_available + is_torchvision_v2_available, ) if is_torch_available(): @@ -37,6 +40,7 @@ else: from torchvision.transforms import functional as F + def squared_euclidean_distance_torch(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor: """ Compute squared Euclidean distances between all pixels and clusters. @@ -70,6 +74,7 @@ def color_quantize_torch(x: torch.Tensor, clusters: torch.Tensor) -> torch.Tenso d = squared_euclidean_distance_torch(x, clusters) return torch.argmin(d, dim=1) + class ImageGPTFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): """ clusters (`np.ndarray` or `list[list[int]]`, *optional*): @@ -101,8 +106,8 @@ class ImageGPTFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): clusters: Optional[np.ndarray] = None resample: Optional[PILImageResampling] = PILImageResampling.BILINEAR - return_tensors: Optional[Union[str, TensorType]] = None, - data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.FIRST, + return_tensors: Optional[Union[str, TensorType]] = (None,) + data_format: Optional[Union[str, ChannelDimension]] = (ChannelDimension.FIRST,) input_data_format: Optional[Union[str, ChannelDimension]] = None do_color_quantize: Optional[bool] = True @@ -176,9 +181,7 @@ def _preprocess( - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. 
""" # Run standard fast pipeline (resize, crop, batching) without rescale/normalize - base_batch = super()._preprocess( - images, return_tensors=return_tensors, **kwargs - ) + base_batch = super()._preprocess(images, return_tensors=return_tensors, **kwargs) pixel_values = base_batch["pixel_values"] # Tensor [B,C,H,W] or list of [C,H,W] # Apply ImageGPT normalization when requested: [-1, 1] @@ -193,11 +196,7 @@ def _preprocess( normalized = [img / 127.5 - 1.0 for img in normalized] # If color quantization is requested, perform it; otherwise return pixel values - do_color_quantize = ( - do_color_quantize - if do_color_quantize is not None - else self.do_color_quantize - ) + do_color_quantize = do_color_quantize if do_color_quantize is not None else self.do_color_quantize if do_color_quantize: # Prepare clusters clusters = clusters if clusters is not None else self.clusters @@ -206,12 +205,8 @@ def _preprocess( clusters_torch = torch.as_tensor(clusters, dtype=torch.float32) # Helper for clarity: quantize a single image [C,H,W] -> [H*W] - def _quantize_one_image( - image_chw: torch.Tensor, clusters_ref: torch.Tensor - ) -> torch.Tensor: - device_clusters = clusters_ref.to( - image_chw.device, dtype=image_chw.dtype - ) + def _quantize_one_image(image_chw: torch.Tensor, clusters_ref: torch.Tensor) -> torch.Tensor: + device_clusters = clusters_ref.to(image_chw.device, dtype=image_chw.dtype) img_hwc = image_chw.permute(1, 2, 0) pixels = img_hwc.reshape(-1, 3) return color_quantize_torch(pixels, device_clusters) @@ -241,9 +236,7 @@ def _quantize_one_image( def to_dict(self): # Convert numpy arrays to lists for JSON serialization output = super().to_dict() - if output.get("clusters") is not None and isinstance( - output["clusters"], np.ndarray - ): + if output.get("clusters") is not None and isinstance(output["clusters"], np.ndarray): output["clusters"] = output["clusters"].tolist() # ImageGPT does not use base mean/std normalization; keep these None for parity with slow 
processor # output["image_mean"] = None From a1b2e7f6fc4b79c2d55bfe9cfea2f90590614762 Mon Sep 17 00:00:00 2001 From: chris Date: Tue, 12 Aug 2025 14:42:09 -0700 Subject: [PATCH 27/33] Added F to normalization --- .../models/imagegpt/image_processing_imagegpt_fast.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py index 7059a69496f1..4e5f05a8f28e 100644 --- a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py +++ b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py @@ -23,6 +23,7 @@ DefaultFastImageProcessorKwargs, ) from ...image_utils import PILImageResampling, ChannelDimension +from ...processing_utils import Unpack from ...utils import ( TensorType, auto_docstring, @@ -137,7 +138,7 @@ class ImageGPTImageProcessorFast(BaseImageProcessorFast): def __init__( self, clusters: Optional[Union[list, np.ndarray]] = None, - **kwargs, + **kwargs: Unpack[ImageGPTFastImageProcessorKwargs], ): super().__init__(**kwargs) # Store clusters as numpy for JSON serializability. Convert to torch in _preprocess when needed. @@ -164,7 +165,7 @@ def _preprocess( do_color_quantize: Optional[bool] = None, clusters: Optional[Union[list, np.ndarray, torch.Tensor]] = None, return_tensors: Optional[str] = None, - **kwargs, + **kwargs: Unpack[ImageGPTFastImageProcessorKwargs], ): """ Preprocess an image or batch of images. 
@@ -189,11 +190,11 @@ def _preprocess( if isinstance(pixel_values, torch.Tensor): normalized = pixel_values.to(dtype=torch.float32) if do_normalize: - normalized = normalized / 127.5 - 1.0 + normalized = F.normalize(normalized, mean=[0.0], std=[127.5]) - 1.0 else: normalized = [img.to(dtype=torch.float32) for img in pixel_values] if do_normalize: - normalized = [img / 127.5 - 1.0 for img in normalized] + normalized = [F.normalize(img, mean=[0.0], std=[127.5]) - 1.0 for img in normalized] # If color quantization is requested, perform it; otherwise return pixel values do_color_quantize = do_color_quantize if do_color_quantize is not None else self.do_color_quantize From cd4d0637eb8a50e1373205c524784f38d9f83095 Mon Sep 17 00:00:00 2001 From: agamjots Date: Tue, 12 Aug 2025 18:06:09 -0700 Subject: [PATCH 28/33] fixed ruff linting and cleaned up fast processor file --- .../imagegpt/image_processing_imagegpt_fast.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py index 4e5f05a8f28e..b65ac4997e08 100644 --- a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py +++ b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py @@ -22,7 +22,7 @@ BaseImageProcessorFast, DefaultFastImageProcessorKwargs, ) -from ...image_utils import PILImageResampling, ChannelDimension +from ...image_utils import ChannelDimension, PILImageResampling from ...processing_utils import Unpack from ...utils import ( TensorType, @@ -32,6 +32,7 @@ is_torchvision_v2_available, ) + if is_torch_available(): import torch @@ -128,12 +129,12 @@ class ImageGPTImageProcessorFast(BaseImageProcessorFast): resample = PILImageResampling.BILINEAR size = {"height": 256, "width": 256} do_resize = True - # We do NOT use the base normalization/rescale as ImageGPT expects (x/127.5 - 1) + # We do NOT want touse the base 
normalization/rescale as ImageGPT expects (x/127.5 - 1) do_rescale = False do_normalize = False do_color_quantize = True - clusters = None # Must be set at instantiation + clusters = None def __init__( self, @@ -183,7 +184,7 @@ def _preprocess( """ # Run standard fast pipeline (resize, crop, batching) without rescale/normalize base_batch = super()._preprocess(images, return_tensors=return_tensors, **kwargs) - pixel_values = base_batch["pixel_values"] # Tensor [B,C,H,W] or list of [C,H,W] + pixel_values = base_batch["pixel_values"] # Apply ImageGPT normalization when requested: [-1, 1] do_normalize = getattr(self, "_do_normalize_imagegpt", True) @@ -205,7 +206,6 @@ def _preprocess( raise ValueError("Clusters must be provided for color quantization.") clusters_torch = torch.as_tensor(clusters, dtype=torch.float32) - # Helper for clarity: quantize a single image [C,H,W] -> [H*W] def _quantize_one_image(image_chw: torch.Tensor, clusters_ref: torch.Tensor) -> torch.Tensor: device_clusters = clusters_ref.to(image_chw.device, dtype=image_chw.dtype) img_hwc = image_chw.permute(1, 2, 0) @@ -239,10 +239,6 @@ def to_dict(self): output = super().to_dict() if output.get("clusters") is not None and isinstance(output["clusters"], np.ndarray): output["clusters"] = output["clusters"].tolist() - # ImageGPT does not use base mean/std normalization; keep these None for parity with slow processor - # output["image_mean"] = None - # output["image_std"] = None - # No rescaling in fast ImageGPT path # Need to set these valus to match with slow processor during testing output["rescale_factor"] = None From 1315a98c3307475658d5facb471ff1b35c83b75f Mon Sep 17 00:00:00 2001 From: agamjots Date: Thu, 21 Aug 2025 22:40:03 -0700 Subject: [PATCH 29/33] implemented requested changes --- .../imagegpt/image_processing_imagegpt.py | 13 +- .../image_processing_imagegpt_fast.py | 259 ++++++++++-------- .../test_image_processing_imagegpt.py | 53 ++++ 3 files changed, 202 insertions(+), 123 deletions(-) 
diff --git a/src/transformers/models/imagegpt/image_processing_imagegpt.py b/src/transformers/models/imagegpt/image_processing_imagegpt.py index a264b948848a..c29e436f17fa 100644 --- a/src/transformers/models/imagegpt/image_processing_imagegpt.py +++ b/src/transformers/models/imagegpt/image_processing_imagegpt.py @@ -297,10 +297,10 @@ def preprocess( # We need to convert back to a list of images to keep consistent behaviour across processors. images = list(images) - data = {"input_ids": images, "pixel_values": pixel_values} + data = {"input_ids": images} else: images = [ - to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) + to_channel_dimension_format(image, data_format, input_data_format) for image in images ] data = {"pixel_values": images} @@ -311,6 +311,15 @@ def to_dict(self): # Ensure clusters are JSON/equality friendly if output.get("clusters") is not None and isinstance(output["clusters"], np.ndarray): output["clusters"] = output["clusters"].tolist() + # Need to set missing keys from slow processor to match the expected behavior in save/load tests compared to fast processor + missing_keys = [ + "image_mean", "image_std", + "rescale_factor", "do_rescale" + ] + for key in missing_keys: + if key in output: + output[key] = None + return output diff --git a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py index b65ac4997e08..ffe5f915be38 100644 --- a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py +++ b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py @@ -18,11 +18,13 @@ import numpy as np +from ...image_processing_utils import BatchFeature from ...image_processing_utils_fast import ( BaseImageProcessorFast, DefaultFastImageProcessorKwargs, ) -from ...image_utils import ChannelDimension, PILImageResampling +from ...image_transforms import group_images_by_shape, reorder_images +from ...image_utils import 
PILImageResampling from ...processing_utils import Unpack from ...utils import ( TensorType, @@ -58,8 +60,8 @@ def squared_euclidean_distance_torch(a: torch.Tensor, b: torch.Tensor) -> torch. a2 = torch.sum(a**2, dim=1) # (N,) b2 = torch.sum(b**2, dim=0) # (M,) ab = torch.matmul(a, b) # (N, M) - d = a2[:, None] - 2 * ab + b2[None, :] - return d + d = a2[:, None] - 2 * ab + b2[None, :] # Squared Euclidean Distance: a^2 - 2ab + b^2 + return d # (N, M) tensor of squared distances def color_quantize_torch(x: torch.Tensor, clusters: torch.Tensor) -> torch.Tensor: @@ -84,118 +86,80 @@ class ImageGPTFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): in `preprocess`. resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`): Resampling filter to use if resizing the image. Can be overridden by `resample` in `preprocess`. - return_tensors (`str` or `TensorType`, *optional*): - The type of tensors to return. Can be one of: - - Unset: Return a list of `torch.Tensor`. - - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. - - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. - - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. - data_format (`ChannelDimension` or `str`, *optional*): - The channel dimension format for the output image. If unset, the channel dimension format of the input - image is used. Can be one of: - - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. - input_data_format (`ChannelDimension` or `str`, *optional*): - The channel dimension format for the input image. If unset, the channel dimension format is inferred - from the input image. Can be one of: - - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. 
- - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. - do_color_quantize (`bool`, *optional*, defaults to `self.do_color_quantize`): - Whether to color quantize the image. """ clusters: Optional[np.ndarray] = None resample: Optional[PILImageResampling] = PILImageResampling.BILINEAR - return_tensors: Optional[Union[str, TensorType]] = (None,) - data_format: Optional[Union[str, ChannelDimension]] = (ChannelDimension.FIRST,) - input_data_format: Optional[Union[str, ChannelDimension]] = None do_color_quantize: Optional[bool] = True @auto_docstring class ImageGPTImageProcessorFast(BaseImageProcessorFast): - """ - Constructs a fast ImageGPT image processor. - - This processor can be used to resize images to a smaller resolution (such as 32x32 or 64x64), - normalize them and finally color quantize them to obtain sequences of "pixel values" (color clusters). - """ - model_input_names = ["input_ids"] - - # Defaults largely aligned with the slow processor, except normalization which we do manually to [-1, 1] resample = PILImageResampling.BILINEAR - size = {"height": 256, "width": 256} - do_resize = True - # We do NOT want touse the base normalization/rescale as ImageGPT expects (x/127.5 - 1) - do_rescale = False - do_normalize = False - do_color_quantize = True clusters = None + # Use standard normalization with image_mean=[0.5, 0.5, 0.5] and image_std=[0.5, 0.5, 0.5] + # This is equivalent to ImageGPT's (x/127.5 - 1) normalization + image_mean = [0.5, 0.5, 0.5] + image_std = [0.5, 0.5, 0.5] + do_rescale = True + do_normalize = True + + # We are keeping this for backwards compatibility def __init__( self, clusters: Optional[Union[list, np.ndarray]] = None, **kwargs: Unpack[ImageGPTFastImageProcessorKwargs], ): super().__init__(**kwargs) - # Store clusters as numpy for JSON serializability. Convert to torch in _preprocess when needed. 
- if clusters is not None: - self.clusters = np.asarray(clusters, dtype=np.float32) - else: - self.clusters = None - # Default: follow ImageGPT behavior (normalize by default). We stash here and force base to skip. - self._do_normalize_imagegpt = kwargs.get("do_normalize", True) - - def _further_process_kwargs(self, **kwargs): - # Let the base process size/crop and other standard kwargs first - kwargs = super()._further_process_kwargs(**kwargs) - if "do_normalize" in kwargs and kwargs["do_normalize"] is not None: - self._do_normalize_imagegpt = kwargs["do_normalize"] - # Force base pipeline to skip its rescale/normalize validation and logic - kwargs["do_rescale"] = False - kwargs["do_normalize"] = False - return kwargs - + # Store clusters as torch tensor directly for efficiency + self.clusters = torch.tensor(clusters, dtype=torch.float32) if clusters is not None else None def _preprocess( self, - images, + images: list["torch.Tensor"], + do_resize: bool, + size: dict[str, int], + interpolation: Optional["F.InterpolationMode"], + do_center_crop: bool, + crop_size: dict[str, int], + do_rescale: bool, + rescale_factor: float, + do_normalize: bool, + image_mean: Optional[Union[float, list[float]]], + image_std: Optional[Union[float, list[float]]], do_color_quantize: Optional[bool] = None, clusters: Optional[Union[list, np.ndarray, torch.Tensor]] = None, - return_tensors: Optional[str] = None, - **kwargs: Unpack[ImageGPTFastImageProcessorKwargs], + disable_grouping: Optional[bool] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + **kwargs, ): - """ - Preprocess an image or batch of images. - - Args: - clusters (`np.ndarray`, `list[list[float]]`, or `torch.Tensor`, *optional*, defaults to `self.clusters`): - Clusters used to quantize the image of shape `(n_clusters, 3)`. Only has an effect if - `do_color_quantize` is set to `True`. - return_tensors (`str` or `TensorType`, *optional*): - The type of tensors to return. 
Can be one of: - - Unset: Return a list of `torch.Tensor`. - - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. - - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. - - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. - """ - # Run standard fast pipeline (resize, crop, batching) without rescale/normalize - base_batch = super()._preprocess(images, return_tensors=return_tensors, **kwargs) - pixel_values = base_batch["pixel_values"] - - # Apply ImageGPT normalization when requested: [-1, 1] - do_normalize = getattr(self, "_do_normalize_imagegpt", True) - if isinstance(pixel_values, torch.Tensor): - normalized = pixel_values.to(dtype=torch.float32) - if do_normalize: - normalized = F.normalize(normalized, mean=[0.0], std=[127.5]) - 1.0 - else: - normalized = [img.to(dtype=torch.float32) for img in pixel_values] - if do_normalize: - normalized = [F.normalize(img, mean=[0.0], std=[127.5]) - 1.0 for img in normalized] + # Unrolled standard image processing pipeline for clarity + + # Group images by size for batched resizing + grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping) + resized_images_grouped = {} + for shape, stacked_images in grouped_images.items(): + if do_resize: + stacked_images = self.resize(image=stacked_images, size=size, interpolation=interpolation) + resized_images_grouped[shape] = stacked_images + resized_images = reorder_images(resized_images_grouped, grouped_images_index) + + # Group images by size for further processing + # Needed in case do_resize is False, or resize returns images with different sizes + grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping) + processed_images_grouped = {} + for shape, stacked_images in grouped_images.items(): + if do_center_crop: + stacked_images = self.center_crop(stacked_images, crop_size) + # Fused rescale and normalize + stacked_images = 
self.rescale_and_normalize( + stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std + ) + processed_images_grouped[shape] = stacked_images + + pixel_values = reorder_images(processed_images_grouped, grouped_images_index) # If color quantization is requested, perform it; otherwise return pixel values do_color_quantize = do_color_quantize if do_color_quantize is not None else self.do_color_quantize @@ -204,47 +168,100 @@ def _preprocess( clusters = clusters if clusters is not None else self.clusters if clusters is None: raise ValueError("Clusters must be provided for color quantization.") - clusters_torch = torch.as_tensor(clusters, dtype=torch.float32) - - def _quantize_one_image(image_chw: torch.Tensor, clusters_ref: torch.Tensor) -> torch.Tensor: - device_clusters = clusters_ref.to(image_chw.device, dtype=image_chw.dtype) - img_hwc = image_chw.permute(1, 2, 0) - pixels = img_hwc.reshape(-1, 3) - return color_quantize_torch(pixels, device_clusters) - - images_list = list(normalized) - - ids_list = [_quantize_one_image(img, clusters_torch) for img in images_list] - - if return_tensors == "pt": - input_ids = torch.stack(ids_list, dim=0) - pixel_values_out = torch.stack(images_list, dim=0) + # Convert to torch tensor if needed (clusters might be passed as list/numpy) + clusters_torch = torch.as_tensor(clusters, dtype=torch.float32) if not isinstance(clusters, torch.Tensor) else clusters + + # Group images by shape for batch processing + # We need to check if the pixel values are a tensor or a list of tensors + if isinstance(pixel_values, torch.Tensor): + # Single batch case + images_list = [pixel_values] else: - input_ids = ids_list - pixel_values_out = images_list - - from ...image_processing_utils import BatchFeature - + # Multiple images case, we group by shape + shape_groups = {} + for i, img in enumerate(pixel_values): + shape = img.shape + if shape not in shape_groups: + shape_groups[shape] = [] + shape_groups[shape].append((i, img)) + 
+ images_list = [] + for shape, group in shape_groups.items(): + if len(group) > 1: + # Batch process images of same shape + batch_imgs = torch.stack([img for _, img in group]) + images_list.append((batch_imgs, [idx for idx, _ in group])) + else: + # Single image + idx, img = group[0] + images_list.append((img.unsqueeze(0), [idx])) + + # Process each group + all_input_ids = [None] * len(pixel_values) if not isinstance(pixel_values, torch.Tensor) else None + + for group_data in images_list: + if isinstance(pixel_values, torch.Tensor): + # Single batch case + batch_imgs = group_data + batch_size = batch_imgs.shape[0] + # Convert from CHW to HWC and flatten + batch_hwc = batch_imgs.permute(0, 2, 3, 1) # (B, H, W, C) + batch_flat = batch_hwc.reshape(batch_size, -1, 3) # (B, H*W, C) + + # Quantize each image in the batch + device_clusters = clusters_torch.to(batch_flat.device, dtype=batch_flat.dtype) + input_ids = color_quantize_torch(batch_flat.reshape(-1, 3), device_clusters) + input_ids = input_ids.reshape(batch_size, -1) # (B, H*W) + + return BatchFeature( + data={"input_ids": input_ids}, + tensor_type=return_tensors, + ) + else: + # Multiple images case + batch_imgs, indices = group_data + if batch_imgs.shape[0] == 1: + # Single image + img = batch_imgs.squeeze(0) + img_hwc = img.permute(1, 2, 0) # (H, W, C) + pixels = img_hwc.reshape(-1, 3) # (H*W, C) + + device_clusters = clusters_torch.to(pixels.device, dtype=pixels.dtype) + input_ids = color_quantize_torch(pixels, device_clusters) + + all_input_ids[indices[0]] = input_ids + else: + # Batch of same shape + batch_hwc = batch_imgs.permute(0, 2, 3, 1) # (B, H, W, C) + batch_flat = batch_hwc.reshape(batch_imgs.shape[0], -1, 3) # (B, H*W, C) + + device_clusters = clusters_torch.to(batch_flat.device, dtype=batch_flat.dtype) + input_ids = color_quantize_torch(batch_flat.reshape(-1, 3), device_clusters) + input_ids = input_ids.reshape(batch_imgs.shape[0], -1) # (B, H*W) + + for i, idx in enumerate(indices): + 
all_input_ids[idx] = input_ids[i] + + # Stack input_ids if returning tensors + if return_tensors: + all_input_ids = torch.stack(all_input_ids, dim=0) return BatchFeature( - data={"input_ids": input_ids, "pixel_values": pixel_values_out}, + data={"input_ids": all_input_ids}, tensor_type=return_tensors, ) - # Otherwise, return pixel values (normalized or not depending on flag) - base_batch["pixel_values"] = normalized - return base_batch + # Otherwise, return normalized pixel values + pixel_values = torch.stack(pixel_values, dim=0) if return_tensors else pixel_values + return BatchFeature(data={"pixel_values": pixel_values}, tensor_type=return_tensors) def to_dict(self): - # Convert numpy arrays to lists for JSON serialization + # Convert torch tensors to lists for JSON serialization output = super().to_dict() - if output.get("clusters") is not None and isinstance(output["clusters"], np.ndarray): + if output.get("clusters") is not None and isinstance(output["clusters"], torch.Tensor): output["clusters"] = output["clusters"].tolist() - # Need to set these valus to match with slow processor during testing - output["rescale_factor"] = None - output["do_rescale"] = None - output["do_color_quantize"] = bool(getattr(self, "do_color_quantize", True)) - output.pop("_do_normalize_imagegpt", None) + # Ensure we match the slow processor's configuration + output["do_color_quantize"] = True return output diff --git a/tests/models/imagegpt/test_image_processing_imagegpt.py b/tests/models/imagegpt/test_image_processing_imagegpt.py index 12a83b25c1d8..21ec7e7f0a8b 100644 --- a/tests/models/imagegpt/test_image_processing_imagegpt.py +++ b/tests/models/imagegpt/test_image_processing_imagegpt.py @@ -256,6 +256,59 @@ def test_call_pytorch(self): (self.image_processor_tester.batch_size, *expected_output_image_shape), ) + def test_slow_fast_equivalence(self): + if self.fast_image_processing_class is None: + self.skipTest("Fast image processing class not available") + + for 
image_processing_class in self.image_processor_list: + # Initialize image_processing + image_processing = image_processing_class(**self.image_processor_dict) + # create random numpy tensors + image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True) + for image in image_inputs: + self.assertIsInstance(image, np.ndarray) + + # Test not batched input + encoding_slow = self.image_processing_class(**self.image_processor_dict)(image_inputs[0], return_tensors="pt") + encoding_fast = self.fast_image_processing_class(**self.image_processor_dict)(image_inputs[0], return_tensors="pt") + # Convert to float for mean calculation since input_ids are integers + slow_tensor = encoding_slow.input_ids.float() + fast_tensor = encoding_fast.input_ids.float() + # For quantization-based processors, use absolute tolerance only to avoid infinity issues + # when one value is 0 and the other is 1. The rtol=0 prevents relative tolerance calculation. + self._assert_slow_fast_tensors_equivalence(slow_tensor, fast_tensor, atol=1.0, rtol=0) + + # Test batched + encoding_slow = self.image_processing_class(**self.image_processor_dict)(image_inputs, return_tensors="pt") + encoding_fast = self.fast_image_processing_class(**self.image_processor_dict)(image_inputs, return_tensors="pt") + # Convert to float for mean calculation since input_ids are integers + slow_tensor = encoding_slow.input_ids.float() + fast_tensor = encoding_fast.input_ids.float() + # Once again using absolute tolerance only to avoid infinity issues + self._assert_slow_fast_tensors_equivalence(slow_tensor, fast_tensor, atol=1.0, rtol=0) + + def test_slow_fast_equivalence_batched(self): + if self.fast_image_processing_class is None: + self.skipTest("Fast image processing class not available") + + for image_processing_class in self.image_processor_list: + # Initialize image_processing + image_processing = image_processing_class(**self.image_processor_dict) + # create random numpy tensors + 
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True) + for image in image_inputs: + self.assertIsInstance(image, np.ndarray) + + # Test batched + encoding_slow = self.image_processing_class(**self.image_processor_dict)(image_inputs, return_tensors="pt") + encoding_fast = self.fast_image_processing_class(**self.image_processor_dict)(image_inputs, return_tensors="pt") + # Convert to float for mean calculation since input_ids are integers + slow_tensor = encoding_slow.input_ids.float() + fast_tensor = encoding_fast.input_ids.float() + # For quantization-based processors, use absolute tolerance only to avoid infinity issues + # when one value is 0 and the other is 1. The rtol=0 prevents relative tolerance calculation. + self._assert_slow_fast_tensors_equivalence(slow_tensor, fast_tensor, atol=1.0, rtol=0) + def prepare_images(): # we use revision="refs/pr/1" until the PR is merged From 0f34fd1b10462c001230ffba75045d22b8fc2326 Mon Sep 17 00:00:00 2001 From: agamjots Date: Thu, 21 Aug 2025 23:25:39 -0700 Subject: [PATCH 30/33] fixed ruff checks --- .../models/imagegpt/image_processing_imagegpt.py | 7 +------ tests/models/imagegpt/test_image_processing_imagegpt.py | 4 ---- 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/src/transformers/models/imagegpt/image_processing_imagegpt.py b/src/transformers/models/imagegpt/image_processing_imagegpt.py index c29e436f17fa..f479026a4527 100644 --- a/src/transformers/models/imagegpt/image_processing_imagegpt.py +++ b/src/transformers/models/imagegpt/image_processing_imagegpt.py @@ -279,11 +279,6 @@ def preprocess( if do_normalize: images = [self.normalize(image=image, input_data_format=input_data_format) for image in images] - # Need pixel_values (normalized, channels_first) for equivalence tests - pixel_values = [ - to_channel_dimension_format(image, ChannelDimension.FIRST, input_channel_dim=input_data_format) - for image in images - ] if do_color_quantize: images = 
[to_channel_dimension_format(image, ChannelDimension.LAST, input_data_format) for image in images] @@ -319,7 +314,7 @@ def to_dict(self): for key in missing_keys: if key in output: output[key] = None - + return output diff --git a/tests/models/imagegpt/test_image_processing_imagegpt.py b/tests/models/imagegpt/test_image_processing_imagegpt.py index 21ec7e7f0a8b..998bb0d665bb 100644 --- a/tests/models/imagegpt/test_image_processing_imagegpt.py +++ b/tests/models/imagegpt/test_image_processing_imagegpt.py @@ -261,8 +261,6 @@ def test_slow_fast_equivalence(self): self.skipTest("Fast image processing class not available") for image_processing_class in self.image_processor_list: - # Initialize image_processing - image_processing = image_processing_class(**self.image_processor_dict) # create random numpy tensors image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True) for image in image_inputs: @@ -292,8 +290,6 @@ def test_slow_fast_equivalence_batched(self): self.skipTest("Fast image processing class not available") for image_processing_class in self.image_processor_list: - # Initialize image_processing - image_processing = image_processing_class(**self.image_processor_dict) # create random numpy tensors image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True) for image in image_inputs: From 4f34393539273c96c6202540e0ea564371880d2a Mon Sep 17 00:00:00 2001 From: agamjots Date: Thu, 21 Aug 2025 23:30:10 -0700 Subject: [PATCH 31/33] fixed formatting issues --- .../models/imagegpt/image_processing_imagegpt.py | 11 ++--------- .../imagegpt/image_processing_imagegpt_fast.py | 7 +++++-- .../imagegpt/test_image_processing_imagegpt.py | 16 ++++++++++++---- 3 files changed, 19 insertions(+), 15 deletions(-) diff --git a/src/transformers/models/imagegpt/image_processing_imagegpt.py b/src/transformers/models/imagegpt/image_processing_imagegpt.py index f479026a4527..1f2026627515 100644 --- 
a/src/transformers/models/imagegpt/image_processing_imagegpt.py +++ b/src/transformers/models/imagegpt/image_processing_imagegpt.py @@ -279,7 +279,6 @@ def preprocess( if do_normalize: images = [self.normalize(image=image, input_data_format=input_data_format) for image in images] - if do_color_quantize: images = [to_channel_dimension_format(image, ChannelDimension.LAST, input_data_format) for image in images] # color quantize from (batch_size, height, width, 3) to (batch_size, height, width) @@ -294,10 +293,7 @@ def preprocess( images = list(images) data = {"input_ids": images} else: - images = [ - to_channel_dimension_format(image, data_format, input_data_format) - for image in images - ] + images = [to_channel_dimension_format(image, data_format, input_data_format) for image in images] data = {"pixel_values": images} return BatchFeature(data=data, tensor_type=return_tensors) @@ -307,10 +303,7 @@ def to_dict(self): if output.get("clusters") is not None and isinstance(output["clusters"], np.ndarray): output["clusters"] = output["clusters"].tolist() # Need to set missing keys from slow processor to match the expected behavior in save/load tests compared to fast processor - missing_keys = [ - "image_mean", "image_std", - "rescale_factor", "do_rescale" - ] + missing_keys = ["image_mean", "image_std", "rescale_factor", "do_rescale"] for key in missing_keys: if key in output: output[key] = None diff --git a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py index ffe5f915be38..53d32ffd1e70 100644 --- a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py +++ b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py @@ -61,7 +61,7 @@ def squared_euclidean_distance_torch(a: torch.Tensor, b: torch.Tensor) -> torch. 
b2 = torch.sum(b**2, dim=0) # (M,) ab = torch.matmul(a, b) # (N, M) d = a2[:, None] - 2 * ab + b2[None, :] # Squared Euclidean Distance: a^2 - 2ab + b^2 - return d # (N, M) tensor of squared distances + return d # (N, M) tensor of squared distances def color_quantize_torch(x: torch.Tensor, clusters: torch.Tensor) -> torch.Tensor: @@ -116,6 +116,7 @@ def __init__( super().__init__(**kwargs) # Store clusters as torch tensor directly for efficiency self.clusters = torch.tensor(clusters, dtype=torch.float32) if clusters is not None else None + def _preprocess( self, images: list["torch.Tensor"], @@ -169,7 +170,9 @@ def _preprocess( if clusters is None: raise ValueError("Clusters must be provided for color quantization.") # Convert to torch tensor if needed (clusters might be passed as list/numpy) - clusters_torch = torch.as_tensor(clusters, dtype=torch.float32) if not isinstance(clusters, torch.Tensor) else clusters + clusters_torch = ( + torch.as_tensor(clusters, dtype=torch.float32) if not isinstance(clusters, torch.Tensor) else clusters + ) # Group images by shape for batch processing # We need to check if the pixel values are a tensor or a list of tensors diff --git a/tests/models/imagegpt/test_image_processing_imagegpt.py b/tests/models/imagegpt/test_image_processing_imagegpt.py index 998bb0d665bb..713325b1dc20 100644 --- a/tests/models/imagegpt/test_image_processing_imagegpt.py +++ b/tests/models/imagegpt/test_image_processing_imagegpt.py @@ -267,8 +267,12 @@ def test_slow_fast_equivalence(self): self.assertIsInstance(image, np.ndarray) # Test not batched input - encoding_slow = self.image_processing_class(**self.image_processor_dict)(image_inputs[0], return_tensors="pt") - encoding_fast = self.fast_image_processing_class(**self.image_processor_dict)(image_inputs[0], return_tensors="pt") + encoding_slow = self.image_processing_class(**self.image_processor_dict)( + image_inputs[0], return_tensors="pt" + ) + encoding_fast = 
self.fast_image_processing_class(**self.image_processor_dict)( + image_inputs[0], return_tensors="pt" + ) # Convert to float for mean calculation since input_ids are integers slow_tensor = encoding_slow.input_ids.float() fast_tensor = encoding_fast.input_ids.float() @@ -278,7 +282,9 @@ def test_slow_fast_equivalence(self): # Test batched encoding_slow = self.image_processing_class(**self.image_processor_dict)(image_inputs, return_tensors="pt") - encoding_fast = self.fast_image_processing_class(**self.image_processor_dict)(image_inputs, return_tensors="pt") + encoding_fast = self.fast_image_processing_class(**self.image_processor_dict)( + image_inputs, return_tensors="pt" + ) # Convert to float for mean calculation since input_ids are integers slow_tensor = encoding_slow.input_ids.float() fast_tensor = encoding_fast.input_ids.float() @@ -297,7 +303,9 @@ def test_slow_fast_equivalence_batched(self): # Test batched encoding_slow = self.image_processing_class(**self.image_processor_dict)(image_inputs, return_tensors="pt") - encoding_fast = self.fast_image_processing_class(**self.image_processor_dict)(image_inputs, return_tensors="pt") + encoding_fast = self.fast_image_processing_class(**self.image_processor_dict)( + image_inputs, return_tensors="pt" + ) # Convert to float for mean calculation since input_ids are integers slow_tensor = encoding_slow.input_ids.float() fast_tensor = encoding_fast.input_ids.float() From 898a80768da87d7e45668477ec0ca6713ee938a3 Mon Sep 17 00:00:00 2001 From: agamjots Date: Mon, 25 Aug 2025 21:32:06 -0700 Subject: [PATCH 32/33] fix(ruff after merging main) --- tests/models/imagegpt/test_image_processing_imagegpt.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/models/imagegpt/test_image_processing_imagegpt.py b/tests/models/imagegpt/test_image_processing_imagegpt.py index d64b6dde3cd0..90ed10aee5df 100644 --- a/tests/models/imagegpt/test_image_processing_imagegpt.py +++ b/tests/models/imagegpt/test_image_processing_imagegpt.py @@ 
-142,7 +142,6 @@ def test_image_processor_to_json_file(self): image_processor_first.to_json_file(json_file_path) image_processor_second = image_processing_class.from_json_file(json_file_path).to_dict() - image_processor_first = image_processor_first.to_dict() for key, value in image_processor_first.items(): if key == "clusters": @@ -150,7 +149,6 @@ def test_image_processor_to_json_file(self): else: self.assertEqual(image_processor_first[key], value) - def test_image_processor_from_and_save_pretrained(self): for image_processing_class in self.image_processor_list: image_processor_first = image_processing_class(**self.image_processor_dict) From f3815ce15f681d87225ae6f02845b6fdf39d27f2 Mon Sep 17 00:00:00 2001 From: yonigozlan Date: Thu, 4 Sep 2025 22:35:22 +0000 Subject: [PATCH 33/33] simplify logic and reuse standard equivalenec tests --- .../imagegpt/image_processing_imagegpt.py | 6 +- .../image_processing_imagegpt_fast.py | 126 +++++------------- .../test_image_processing_imagegpt.py | 117 +++++++++------- 3 files changed, 104 insertions(+), 145 deletions(-) diff --git a/src/transformers/models/imagegpt/image_processing_imagegpt.py b/src/transformers/models/imagegpt/image_processing_imagegpt.py index 66f4f28f083e..1f2026627515 100644 --- a/src/transformers/models/imagegpt/image_processing_imagegpt.py +++ b/src/transformers/models/imagegpt/image_processing_imagegpt.py @@ -26,7 +26,7 @@ PILImageResampling, infer_channel_dimension_format, is_scaled_image, - make_flat_list_of_images, + make_list_of_images, to_numpy_array, valid_images, validate_preprocess_arguments, @@ -238,7 +238,7 @@ def preprocess( clusters = clusters if clusters is not None else self.clusters clusters = np.array(clusters) - images = make_flat_list_of_images(images) + images = make_list_of_images(images) if not valid_images(images): raise ValueError( @@ -247,7 +247,7 @@ def preprocess( ) # Here, normalize() is using a constant factor to divide pixel values. 
- # hence, the method does not need image_mean and image_std. + # hence, the method does not need iamge_mean and image_std. validate_preprocess_arguments( do_resize=do_resize, size=size, diff --git a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py index 53d32ffd1e70..736666fd28a0 100644 --- a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py +++ b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py @@ -81,16 +81,16 @@ def color_quantize_torch(x: torch.Tensor, clusters: torch.Tensor) -> torch.Tenso class ImageGPTFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): """ - clusters (`np.ndarray` or `list[list[int]]`, *optional*): + clusters (`np.ndarray` or `list[list[int]]` or `torch.Tensor`, *optional*): The color clusters to use, of shape `(n_clusters, 3)` when color quantizing. Can be overridden by `clusters` in `preprocess`. - resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`): - Resampling filter to use if resizing the image. Can be overridden by `resample` in `preprocess`. + do_color_quantize (`bool`, *optional*, defaults to `True`): + Controls whether to apply color quantization to convert continuous pixel values to discrete cluster indices. + When True, each pixel is assigned to its nearest color cluster, enabling ImageGPT's discrete token modeling. 
""" - clusters: Optional[np.ndarray] = None - resample: Optional[PILImageResampling] = PILImageResampling.BILINEAR - do_color_quantize: Optional[bool] = True + clusters: Optional[Union[np.ndarray, list[list[int]], torch.Tensor]] + do_color_quantize: Optional[bool] @auto_docstring @@ -99,23 +99,24 @@ class ImageGPTImageProcessorFast(BaseImageProcessorFast): resample = PILImageResampling.BILINEAR do_color_quantize = True clusters = None - - # Use standard normalization with image_mean=[0.5, 0.5, 0.5] and image_std=[0.5, 0.5, 0.5] - # This is equivalent to ImageGPT's (x/127.5 - 1) normalization image_mean = [0.5, 0.5, 0.5] image_std = [0.5, 0.5, 0.5] do_rescale = True do_normalize = True + valid_kwargs = ImageGPTFastImageProcessorKwargs - # We are keeping this for backwards compatibility def __init__( self, - clusters: Optional[Union[list, np.ndarray]] = None, + clusters: Optional[Union[list, np.ndarray, torch.Tensor]] = None, # keep as arg for backwards compatibility **kwargs: Unpack[ImageGPTFastImageProcessorKwargs], ): - super().__init__(**kwargs) - # Store clusters as torch tensor directly for efficiency - self.clusters = torch.tensor(clusters, dtype=torch.float32) if clusters is not None else None + r""" + clusters (`np.ndarray` or `list[list[int]]` or `torch.Tensor`, *optional*): + The color clusters to use, of shape `(n_clusters, 3)` when color quantizing. Can be overridden by `clusters` + in `preprocess`. 
+ """ + clusters = torch.as_tensor(clusters, dtype=torch.float32) if clusters is not None else None + super().__init__(clusters=clusters, **kwargs) def _preprocess( self, @@ -136,8 +137,6 @@ def _preprocess( return_tensors: Optional[Union[str, TensorType]] = None, **kwargs, ): - # Unrolled standard image processing pipeline for clarity - # Group images by size for batched resizing grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping) resized_images_grouped = {} @@ -163,97 +162,38 @@ def _preprocess( pixel_values = reorder_images(processed_images_grouped, grouped_images_index) # If color quantization is requested, perform it; otherwise return pixel values - do_color_quantize = do_color_quantize if do_color_quantize is not None else self.do_color_quantize if do_color_quantize: # Prepare clusters - clusters = clusters if clusters is not None else self.clusters if clusters is None: raise ValueError("Clusters must be provided for color quantization.") # Convert to torch tensor if needed (clusters might be passed as list/numpy) clusters_torch = ( torch.as_tensor(clusters, dtype=torch.float32) if not isinstance(clusters, torch.Tensor) else clusters - ) + ).to(pixel_values[0].device, dtype=pixel_values[0].dtype) # Group images by shape for batch processing # We need to check if the pixel values are a tensor or a list of tensors - if isinstance(pixel_values, torch.Tensor): - # Single batch case - images_list = [pixel_values] - else: - # Multiple images case, we group by shape - shape_groups = {} - for i, img in enumerate(pixel_values): - shape = img.shape - if shape not in shape_groups: - shape_groups[shape] = [] - shape_groups[shape].append((i, img)) - - images_list = [] - for shape, group in shape_groups.items(): - if len(group) > 1: - # Batch process images of same shape - batch_imgs = torch.stack([img for _, img in group]) - images_list.append((batch_imgs, [idx for idx, _ in group])) - else: - # Single image - idx, img = 
group[0] - images_list.append((img.unsqueeze(0), [idx])) - + grouped_images, grouped_images_index = group_images_by_shape( + pixel_values, disable_grouping=disable_grouping + ) # Process each group - all_input_ids = [None] * len(pixel_values) if not isinstance(pixel_values, torch.Tensor) else None - - for group_data in images_list: - if isinstance(pixel_values, torch.Tensor): - # Single batch case - batch_imgs = group_data - batch_size = batch_imgs.shape[0] - # Convert from CHW to HWC and flatten - batch_hwc = batch_imgs.permute(0, 2, 3, 1) # (B, H, W, C) - batch_flat = batch_hwc.reshape(batch_size, -1, 3) # (B, H*W, C) - - # Quantize each image in the batch - device_clusters = clusters_torch.to(batch_flat.device, dtype=batch_flat.dtype) - input_ids = color_quantize_torch(batch_flat.reshape(-1, 3), device_clusters) - input_ids = input_ids.reshape(batch_size, -1) # (B, H*W) - - return BatchFeature( - data={"input_ids": input_ids}, - tensor_type=return_tensors, - ) - else: - # Multiple images case - batch_imgs, indices = group_data - if batch_imgs.shape[0] == 1: - # Single image - img = batch_imgs.squeeze(0) - img_hwc = img.permute(1, 2, 0) # (H, W, C) - pixels = img_hwc.reshape(-1, 3) # (H*W, C) - - device_clusters = clusters_torch.to(pixels.device, dtype=pixels.dtype) - input_ids = color_quantize_torch(pixels, device_clusters) - - all_input_ids[indices[0]] = input_ids - else: - # Batch of same shape - batch_hwc = batch_imgs.permute(0, 2, 3, 1) # (B, H, W, C) - batch_flat = batch_hwc.reshape(batch_imgs.shape[0], -1, 3) # (B, H*W, C) - - device_clusters = clusters_torch.to(batch_flat.device, dtype=batch_flat.dtype) - input_ids = color_quantize_torch(batch_flat.reshape(-1, 3), device_clusters) - input_ids = input_ids.reshape(batch_imgs.shape[0], -1) # (B, H*W) - - for i, idx in enumerate(indices): - all_input_ids[idx] = input_ids[i] - - # Stack input_ids if returning tensors - if return_tensors: - all_input_ids = torch.stack(all_input_ids, dim=0) + input_ids_grouped = 
{}
+
+        for shape, stacked_images in grouped_images.items():
+            input_ids = color_quantize_torch(
+                stacked_images.permute(0, 2, 3, 1).reshape(-1, 3), clusters_torch
+            )  # (B*H*W,)
+            input_ids_grouped[shape] = input_ids.reshape(
+                stacked_images.shape[0], -1
+            )  # (B, H*W)
+
+        input_ids = reorder_images(input_ids_grouped, grouped_images_index)
+
             return BatchFeature(
-                data={"input_ids": all_input_ids},
+                data={"input_ids": torch.stack(input_ids, dim=0) if return_tensors else input_ids},
                 tensor_type=return_tensors,
             )
-
         # Otherwise, return normalized pixel values
         pixel_values = torch.stack(pixel_values, dim=0) if return_tensors else pixel_values
         return BatchFeature(data={"pixel_values": pixel_values}, tensor_type=return_tensors)
@@ -263,8 +203,6 @@ def to_dict(self):
         if output.get("clusters") is not None and isinstance(output["clusters"], torch.Tensor):
             output["clusters"] = output["clusters"].tolist()
 
-        # Ensure we match the slow processor's configuration
-        output["do_color_quantize"] = True
         return output
 
 
diff --git a/tests/models/imagegpt/test_image_processing_imagegpt.py b/tests/models/imagegpt/test_image_processing_imagegpt.py
index 90ed10aee5df..8c04d9585022 100644
--- a/tests/models/imagegpt/test_image_processing_imagegpt.py
+++ b/tests/models/imagegpt/test_image_processing_imagegpt.py
@@ -19,10 +19,20 @@
 import unittest
 
 import numpy as np
+import pytest
+import requests
 from datasets import load_dataset
+from packaging import version
 
 from transformers import AutoImageProcessor
-from transformers.testing_utils import check_json_file_has_correct_format, require_torch, require_vision, slow
+from transformers.testing_utils import (
+    check_json_file_has_correct_format,
+    require_torch,
+    require_torch_accelerator,
+    require_vision,
+    slow,
+    torch_device,
+)
 from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
 
 from ...test_image_processing_common import ImageProcessingTestMixin, 
prepare_image_inputs @@ -256,62 +266,73 @@ def test_call_pytorch(self): (self.image_processor_tester.batch_size, *expected_output_image_shape), ) + # For quantization-based processors, use absolute tolerance only to avoid infinity issues + @require_vision + @require_torch def test_slow_fast_equivalence(self): - if self.fast_image_processing_class is None: - self.skipTest("Fast image processing class not available") + if not self.test_slow_image_processor or not self.test_fast_image_processor: + self.skipTest(reason="Skipping slow/fast equivalence test") - for image_processing_class in self.image_processor_list: - # create random numpy tensors - image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True) - for image in image_inputs: - self.assertIsInstance(image, np.ndarray) + if self.image_processing_class is None or self.fast_image_processing_class is None: + self.skipTest(reason="Skipping slow/fast equivalence test as one of the image processors is not defined") - # Test not batched input - encoding_slow = self.image_processing_class(**self.image_processor_dict)( - image_inputs[0], return_tensors="pt" - ) - encoding_fast = self.fast_image_processing_class(**self.image_processor_dict)( - image_inputs[0], return_tensors="pt" - ) - # Convert to float for mean calculation since input_ids are integers - slow_tensor = encoding_slow.input_ids.float() - fast_tensor = encoding_fast.input_ids.float() - # For quantization-based processors, use absolute tolerance only to avoid infinity issues - # when one value is 0 and the other is 1. The rtol=0 prevents relative tolerance calculation. 
- self._assert_slow_fast_tensors_equivalence(slow_tensor, fast_tensor, atol=1.0, rtol=0) + dummy_image = Image.open( + requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw + ) + image_processor_slow = self.image_processing_class(**self.image_processor_dict) + image_processor_fast = self.fast_image_processing_class(**self.image_processor_dict) - # Test batched - encoding_slow = self.image_processing_class(**self.image_processor_dict)(image_inputs, return_tensors="pt") - encoding_fast = self.fast_image_processing_class(**self.image_processor_dict)( - image_inputs, return_tensors="pt" - ) - # Convert to float for mean calculation since input_ids are integers - slow_tensor = encoding_slow.input_ids.float() - fast_tensor = encoding_fast.input_ids.float() - # Once again using absolute tolerance only to avoid infinity issues - self._assert_slow_fast_tensors_equivalence(slow_tensor, fast_tensor, atol=1.0, rtol=0) + encoding_slow = image_processor_slow(dummy_image, return_tensors="pt") + encoding_fast = image_processor_fast(dummy_image, return_tensors="pt") + self._assert_slow_fast_tensors_equivalence( + encoding_slow.input_ids.float(), encoding_fast.input_ids.float(), atol=1.0, rtol=0 + ) + @require_vision + @require_torch def test_slow_fast_equivalence_batched(self): - if self.fast_image_processing_class is None: - self.skipTest("Fast image processing class not available") + if not self.test_slow_image_processor or not self.test_fast_image_processor: + self.skipTest(reason="Skipping slow/fast equivalence test") - for image_processing_class in self.image_processor_list: - # create random numpy tensors - image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True) - for image in image_inputs: - self.assertIsInstance(image, np.ndarray) + if self.image_processing_class is None or self.fast_image_processing_class is None: + self.skipTest(reason="Skipping slow/fast equivalence test as one of the image 
processors is not defined") - # Test batched - encoding_slow = self.image_processing_class(**self.image_processor_dict)(image_inputs, return_tensors="pt") - encoding_fast = self.fast_image_processing_class(**self.image_processor_dict)( - image_inputs, return_tensors="pt" + if hasattr(self.image_processor_tester, "do_center_crop") and self.image_processor_tester.do_center_crop: + self.skipTest( + reason="Skipping as do_center_crop is True and center_crop functions are not equivalent for fast and slow processors" ) - # Convert to float for mean calculation since input_ids are integers - slow_tensor = encoding_slow.input_ids.float() - fast_tensor = encoding_fast.input_ids.float() - # For quantization-based processors, use absolute tolerance only to avoid infinity issues - # when one value is 0 and the other is 1. The rtol=0 prevents relative tolerance calculation. - self._assert_slow_fast_tensors_equivalence(slow_tensor, fast_tensor, atol=1.0, rtol=0) + + dummy_images = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True) + image_processor_slow = self.image_processing_class(**self.image_processor_dict) + image_processor_fast = self.fast_image_processing_class(**self.image_processor_dict) + + encoding_slow = image_processor_slow(dummy_images, return_tensors="pt") + encoding_fast = image_processor_fast(dummy_images, return_tensors="pt") + + self._assert_slow_fast_tensors_equivalence( + encoding_slow.input_ids.float(), encoding_fast.input_ids.float(), atol=1.0, rtol=0 + ) + + @slow + @require_torch_accelerator + @require_vision + @pytest.mark.torch_compile_test + def test_can_compile_fast_image_processor(self): + if self.fast_image_processing_class is None: + self.skipTest("Skipping compilation test as fast image processor is not defined") + if version.parse(torch.__version__) < version.parse("2.3"): + self.skipTest(reason="This test requires torch >= 2.3 to run.") + + torch.compiler.reset() + input_image = torch.randint(0, 255, (3, 
224, 224), dtype=torch.uint8) + image_processor = self.fast_image_processing_class(**self.image_processor_dict) + output_eager = image_processor(input_image, device=torch_device, return_tensors="pt") + + image_processor = torch.compile(image_processor, mode="reduce-overhead") + output_compiled = image_processor(input_image, device=torch_device, return_tensors="pt") + self._assert_slow_fast_tensors_equivalence( + output_eager.input_ids.float(), output_compiled.input_ids.float(), atol=1.0, rtol=0 + ) def prepare_images():