From 69ac401c42da8cde87e7247fc9a100aa7502434d Mon Sep 17 00:00:00 2001
From: Joshua Lochner <26504141+xenova@users.noreply.github.com>
Date: Sun, 15 Mar 2026 15:52:01 -0400
Subject: [PATCH 1/5] Create int_div_ceil helper function

---
 src/transformers/utils/__init__.py |  1 +
 src/transformers/utils/generic.py  | 13 +++++++++++++
 2 files changed, 14 insertions(+)

diff --git a/src/transformers/utils/__init__.py b/src/transformers/utils/__init__.py
index 699d28c7ff04..c6b34a22edb5 100644
--- a/src/transformers/utils/__init__.py
+++ b/src/transformers/utils/__init__.py
@@ -55,6 +55,7 @@
     filter_out_non_signature_kwargs,
     find_labels,
     flatten_dict,
+    int_div_ceil,
     is_numpy_array,
     is_tensor,
     is_timm_config_dict,
diff --git a/src/transformers/utils/generic.py b/src/transformers/utils/generic.py
index 379b23b58de6..2dd6af34eb8a 100644
--- a/src/transformers/utils/generic.py
+++ b/src/transformers/utils/generic.py
@@ -665,6 +665,19 @@ def torch_float(x):
     return x.to(torch.float32) if torch.jit.is_tracing() and isinstance(x, torch.Tensor) else int(x)
 
 
+def int_div_ceil(a: int | "torch.Tensor", b: int | "torch.Tensor") -> int:
+    """
+    Return the ceiling of `a / b` using floor division only, so operands are never cast to floats.
+
+    Negating before and after `//` turns the floor into a ceiling. Unlike `(a + b - 1) // b`, this holds for
+    positive or negative operands and when `a` has a fractional part; the result has the type `//` yields.
+
+    Float division can hit runtime-specific precision issues: `math.ceil(torch.tensor(300).cuda() / 30) == 11`,
+    while `int_div_ceil(torch.tensor(300).cuda(), 30) == 10` as expected.
+    """
+    return -(-a // b)
+
+
 def filter_out_non_signature_kwargs(extra: list | None = None):
     """
     Decorator to filter out named arguments that are not in the function signature.

From e75137f59d80b87d439d0bcd8502d1e920b6b6e8 Mon Sep 17 00:00:00 2001
From: Joshua Lochner <26504141+xenova@users.noreply.github.com>
Date: Sun, 15 Mar 2026 16:16:21 -0400
Subject: [PATCH 2/5] Avoid floating point math for ceil operations

---
 src/transformers/core_model_loading.py        |  5 ++-
 .../integrations/tensor_parallel.py           | 11 +++----
 src/transformers/models/bit/modeling_bit.py   |  4 +--
 .../chmv2/image_processing_chmv2_fast.py      |  6 ++--
 .../models/dac/configuration_dac.py           |  6 ++--
 src/transformers/models/dac/modeling_dac.py   |  6 ++--
 .../models/depth_pro/modeling_depth_pro.py    |  8 ++---
 .../models/dpt/convert_dinov2_depth_to_hf.py  |  5 ++-
 .../models/dpt/image_processing_dpt.py        |  5 +--
 .../models/dpt/image_processing_dpt_fast.py   |  6 ++--
 src/transformers/models/dpt/modular_dpt.py    |  5 +--
 .../models/encodec/configuration_encodec.py   |  4 +--
 .../models/eomt/image_processing_eomt.py      |  5 ++-
 .../models/eomt/image_processing_eomt_fast.py |  4 +--
 .../configuration_falcon_mamba.py             |  6 ++--
 .../models/fuyu/image_processing_fuyu.py      | 14 ++-------
 .../models/fuyu/image_processing_fuyu_fast.py | 13 ++------
 .../models/gemma3/image_processing_gemma3.py  |  6 ++--
 .../gemma3/image_processing_gemma3_fast.py    |  5 +--
 .../feature_extraction_granite_speech.py      |  5 ++-
 .../granite_speech/modeling_granite_speech.py |  6 ++--
 .../configuration_higgs_audio_v2_tokenizer.py |  4 +--
 .../idefics3/image_processing_idefics3.py     | 31 +++++++++----------
 .../image_processing_idefics3_fast.py         | 28 ++++++++---------
 .../models/jamba/configuration_jamba.py       |  5 ++-
 .../models/layoutlmv2/modeling_layoutlmv2.py  |  6 ++--
 .../models/lfm2_vl/processing_lfm2_vl.py      |  9 +++---
 .../modeling_llava_onevision.py               |  4 +--
 .../modular_llava_onevision.py                |  4 +--
 .../processing_llava_onevision.py             |  4 +--
 .../models/mamba/configuration_mamba.py       |  5 ++-
 ...convert_mamba_ssm_checkpoint_to_pytorch.py |  5 ++-
 .../models/mamba2/configuration_mamba2.py     |  5 ++-
 .../image_processing_mask2former.py           |  6 ++--
 .../image_processing_mask2former_fast.py      |  7 ++---
 .../maskformer/image_processing_maskformer.py |  6 ++--
 .../image_processing_maskformer_fast.py       |  6 ++--
 .../metaclip_2/convert_metaclip_2_to_hf.py    |  1 -
 .../models/mimi/configuration_mimi.py         |  4 +--
 .../mllama/convert_mllama_weights_to_hf.py    |  4 +--
 .../models/mobilevit/modeling_mobilevit.py    |  6 ++--
 .../modeling_musicgen_melody.py               |  6 ++--
 .../models/nllb_moe/modeling_nllb_moe.py      |  4 +--
 .../models/pe_audio/modeling_pe_audio.py      |  5 ++-
 .../models/perceiver/modeling_perceiver.py    |  4 +--
 .../modeling_phi4_multimodal.py               |  4 +--
 .../modular_phi4_multimodal.py                |  4 +--
 .../qwen3_vl/video_processing_qwen3_vl.py     |  4 +--
 .../smolvlm/image_processing_smolvlm.py       | 31 +++++++++----------
 .../smolvlm/image_processing_smolvlm_fast.py  | 27 ++++++++--------
 .../modeling_voxtral_realtime.py              | 11 +++++--
 .../modular_voxtral_realtime.py               | 11 +++++--
 .../models/xcodec/configuration_xcodec.py     |  4 +--
 .../models/zamba/configuration_zamba.py       |  5 ++-
 src/transformers/trainer.py                   |  5 +--
 src/transformers/trainer_pt_utils.py          | 10 +++---
 src/transformers/utils/generic.py             |  2 +-
 57 files changed, 201 insertions(+), 221 deletions(-)

diff --git a/src/transformers/core_model_loading.py b/src/transformers/core_model_loading.py
index b62fc90d4cdf..3caf538c4c48 100644
--- a/src/transformers/core_model_loading.py
+++ b/src/transformers/core_model_loading.py
@@ -15,7 +15,6 @@
 
 from __future__ import annotations
 
-import math
 import os
 import re
 import traceback
@@ -33,7 +32,7 @@
 
 from .integrations.accelerate import get_device, offload_weight
 from .integrations.tensor_parallel import ALL_PARALLEL_STYLES
-from .utils import is_env_variable_true
+from .utils import int_div_ceil, is_env_variable_true
 from .utils.loading_report import LoadStateDictInfo
 from .utils.logging import get_logger, tqdm
 
@@ -354,7 +353,7 @@ def __init__(self, stack_dim: int = 0, concat_dim: int = 1):
         self.concat_dim = concat_dim
 
     def split_list_into_chunks(self, tensor_list: list[torch.Tensor], chunks: int = 2):
-        split_size = math.ceil(len(tensor_list) / chunks)  # best effort split size
+        split_size = int_div_ceil(len(tensor_list), chunks)  # best effort split size
         return [tensor_list[i * split_size : (i + 1) * split_size] for i in range(chunks)]
 
     @torch.no_grad()
diff --git a/src/transformers/integrations/tensor_parallel.py b/src/transformers/integrations/tensor_parallel.py
index f9a6d5233b0e..5b561fd41f9d 100644
--- a/src/transformers/integrations/tensor_parallel.py
+++ b/src/transformers/integrations/tensor_parallel.py
@@ -13,14 +13,13 @@
 # limitations under the License.
 from __future__ import annotations
 
-import math
 import operator
 import os
 import re
 from functools import reduce
 
 from ..distributed import DistributedConfig
-from ..utils import is_torch_greater_or_equal, logging
+from ..utils import int_div_ceil, is_torch_greater_or_equal, logging
 from ..utils.generic import GeneralInterface
 from ..utils.import_utils import is_torch_available
 
@@ -374,7 +373,7 @@ def get_tensor_shard(param, empty_param, device_mesh, rank, dim, tensor_idx: int
     elif empty_param.dim() == 3 and dim == 2 and len(param_shape) == 2:
         dim = 1
 
-    shard_size = math.ceil(param_shape[dim] / world_size)
+    shard_size = int_div_ceil(param_shape[dim], world_size)
     start = rank * shard_size
     end = min(start + shard_size, param_shape[dim])
 
@@ -723,7 +722,7 @@ def get_expected_sharded_shape(self, full_shape: tuple[int, ...] | torch.Size) -
         # Colwise shards dim -2, but 1D tensors (bias) shard on dim -1
         dim = -1 if len(shape) == 1 else -2
         dim = len(shape) + dim if dim < 0 else dim
-        shard_size = math.ceil(shape[dim] / world_size)
+        shard_size = int_div_ceil(shape[dim], world_size)
         start = self.rank * shard_size
         end = min(start + shard_size, shape[dim])
         shape[dim] = end - start
@@ -866,7 +865,7 @@ def get_expected_sharded_shape(self, full_shape: tuple[int, ...] | torch.Size) -
         shape = list(full_shape)
         dim = -1
         dim = len(shape) + dim if dim < 0 else dim
-        shard_size = math.ceil(shape[dim] / world_size)
+        shard_size = int_div_ceil(shape[dim], world_size)
         start = self.rank * shard_size
         end = min(start + shard_size, shape[dim])
         shape[dim] = end - start
@@ -996,7 +995,7 @@ def get_expected_sharded_shape(self, full_shape: tuple[int, ...] | torch.Size) -
         # 1D tensors (bias) shard on dim -1
         dim = -1 if len(shape) == 1 else self.embedding_dim_sharding
         dim = len(shape) + dim if dim < 0 else dim
-        shard_size = math.ceil(shape[dim] / world_size)
+        shard_size = int_div_ceil(shape[dim], world_size)
         start = self.rank * shard_size
         end = min(start + shard_size, shape[dim])
         shape[dim] = end - start
diff --git a/src/transformers/models/bit/modeling_bit.py b/src/transformers/models/bit/modeling_bit.py
index 66b5a62234a5..2295b44db12b 100644
--- a/src/transformers/models/bit/modeling_bit.py
+++ b/src/transformers/models/bit/modeling_bit.py
@@ -30,7 +30,7 @@
     ImageClassifierOutputWithNoAttention,
 )
 from ...modeling_utils import PreTrainedModel
-from ...utils import auto_docstring, logging
+from ...utils import auto_docstring, int_div_ceil, logging
 from ...utils.generic import can_return_tuple
 from .configuration_bit import BitConfig
 
@@ -169,7 +169,7 @@ def __init__(self, kernel_size, stride, dilation, value=0):
         self.value = value
 
         def compute_padding(x, kernel_size, stride, dilation):
-            return max((math.ceil(x / stride) - 1) * stride + (kernel_size - 1) * dilation + 1 - x, 0)
+            return max((int_div_ceil(x, stride) - 1) * stride + (kernel_size - 1) * dilation + 1 - x, 0)
 
         self.compute_padding = compute_padding
 
diff --git a/src/transformers/models/chmv2/image_processing_chmv2_fast.py b/src/transformers/models/chmv2/image_processing_chmv2_fast.py
index 1a80e1eb4cc9..77afdf463613 100644
--- a/src/transformers/models/chmv2/image_processing_chmv2_fast.py
+++ b/src/transformers/models/chmv2/image_processing_chmv2_fast.py
@@ -31,7 +31,7 @@
 from ...image_utils import ChannelDimension, ImageInput, PILImageResampling, SizeDict, is_torch_tensor
 from ...modeling_outputs import DepthEstimatorOutput
 from ...processing_utils import Unpack
-from ...utils import TensorType, auto_docstring, requires_backends
+from ...utils import TensorType, auto_docstring, int_div_ceil, requires_backends
 from .image_processing_chmv2 import CHMv2ImageProcessorKwargs
 
 
@@ -48,7 +48,7 @@ def constrain_to_multiple_of(val, multiple, min_val=0, max_val=None):
             x = math.floor(val / multiple) * multiple
 
         if x < min_val:
-            x = math.ceil(val / multiple) * multiple
+            x = int_div_ceil(val, multiple) * multiple
 
         return x
 
@@ -313,7 +313,7 @@ def pad_image(
         height, width = image.shape[-2:]
 
         def _get_pad(size, size_divisor):
-            new_size = math.ceil(size / size_divisor) * size_divisor
+            new_size = int_div_ceil(size, size_divisor) * size_divisor
             pad_size = new_size - size
             pad_size_left = pad_size // 2
             pad_size_right = pad_size - pad_size_left
diff --git a/src/transformers/models/dac/configuration_dac.py b/src/transformers/models/dac/configuration_dac.py
index 551c1964bebc..88e8d9701574 100644
--- a/src/transformers/models/dac/configuration_dac.py
+++ b/src/transformers/models/dac/configuration_dac.py
@@ -13,12 +13,10 @@
 # limitations under the License.
 """Dac model configuration"""
 
-import math
-
 import numpy as np
 
 from ...configuration_utils import PreTrainedConfig
-from ...utils import auto_docstring, logging
+from ...utils import auto_docstring, int_div_ceil, logging
 
 
 logger = logging.get_logger(__name__)
@@ -88,7 +86,7 @@ def __init__(
     @property
     def frame_rate(self) -> int:
         hop_length = np.prod(self.upsampling_ratios)
-        return math.ceil(self.sampling_rate / hop_length)
+        return int_div_ceil(self.sampling_rate, int(hop_length))
 
 
 __all__ = ["DacConfig"]
diff --git a/src/transformers/models/dac/modeling_dac.py b/src/transformers/models/dac/modeling_dac.py
index 6ac46f78a4a6..d0be39982a2e 100644
--- a/src/transformers/models/dac/modeling_dac.py
+++ b/src/transformers/models/dac/modeling_dac.py
@@ -23,7 +23,7 @@
 
 from ... import initialization as init
 from ...modeling_utils import PreTrainedAudioTokenizerBase
-from ...utils import ModelOutput, auto_docstring
+from ...utils import ModelOutput, auto_docstring, int_div_ceil
 from .configuration_dac import DacConfig
 
 
@@ -219,7 +219,7 @@ def __init__(self, config: DacConfig, stride: int = 1, stride_index: int = 1):
         self.res_unit3 = DacResidualUnit(dimension // 2, dilation=9)
         self.snake1 = Snake1d(dimension // 2)
         self.conv1 = nn.Conv1d(
-            dimension // 2, dimension, kernel_size=2 * stride, stride=stride, padding=math.ceil(stride / 2)
+            dimension // 2, dimension, kernel_size=2 * stride, stride=stride, padding=int_div_ceil(stride, 2)
         )
 
     def forward(self, hidden_state):
@@ -245,7 +245,7 @@ def __init__(self, config: DacConfig, stride: int = 1, stride_index: int = 1):
             output_dim,
             kernel_size=2 * stride,
             stride=stride,
-            padding=math.ceil(stride / 2),
+            padding=int_div_ceil(stride, 2),
         )
 
         self.res_unit1 = DacResidualUnit(output_dim, dilation=1)
diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py
index dfa9fb5d5f79..d0b227bc805b 100644
--- a/src/transformers/models/depth_pro/modeling_depth_pro.py
+++ b/src/transformers/models/depth_pro/modeling_depth_pro.py
@@ -22,7 +22,7 @@
 
 from ... import initialization as init
 from ...modeling_utils import PreTrainedModel
-from ...utils import ModelOutput, auto_docstring, logging, torch_int
+from ...utils import ModelOutput, auto_docstring, int_div_ceil, logging, torch_int
 from ..auto import AutoModel
 from .configuration_depth_pro import DepthProConfig
 
@@ -895,8 +895,8 @@ def __init__(self, config: DepthProConfig):
         for i in range(config.num_fov_head_layers):
             self.layers.append(
                 nn.Conv2d(
-                    math.ceil(self.fusion_hidden_size / 2 ** (i + 1)),
-                    math.ceil(self.fusion_hidden_size / 2 ** (i + 2)),
+                    int_div_ceil(self.fusion_hidden_size, 2 ** (i + 1)),
+                    int_div_ceil(self.fusion_hidden_size, 2 ** (i + 2)),
                     kernel_size=3,
                     stride=2,
                     padding=1,
@@ -904,7 +904,7 @@ def __init__(self, config: DepthProConfig):
             )
             self.layers.append(nn.ReLU(True))
         # calculate expected shapes to finally generate a scalar output from final head layer
-        final_in_channels = math.ceil(self.fusion_hidden_size / 2 ** (config.num_fov_head_layers + 1))
+        final_in_channels = int_div_ceil(self.fusion_hidden_size, 2 ** (config.num_fov_head_layers + 1))
         final_kernel_size = torch_int((self.out_size - 1) / 2**config.num_fov_head_layers + 1)
         self.layers.append(
             nn.Conv2d(
diff --git a/src/transformers/models/dpt/convert_dinov2_depth_to_hf.py b/src/transformers/models/dpt/convert_dinov2_depth_to_hf.py
index cb3ce6ec869e..94dd70944ce2 100644
--- a/src/transformers/models/dpt/convert_dinov2_depth_to_hf.py
+++ b/src/transformers/models/dpt/convert_dinov2_depth_to_hf.py
@@ -16,7 +16,6 @@
 
 import argparse
 import itertools
-import math
 from io import BytesIO
 from pathlib import Path
 
@@ -26,7 +25,7 @@
 from torchvision import transforms
 
 from transformers import Dinov2Config, DPTConfig, DPTForDepthEstimation, DPTImageProcessor
-from transformers.utils import logging
+from transformers.utils import int_div_ceil, logging
 
 
 logging.set_verbosity_info()
@@ -207,7 +206,7 @@ def __init__(self, multiple):
             self.multiple = multiple
 
         def _get_pad(self, size):
-            new_size = math.ceil(size / self.multiple) * self.multiple
+            new_size = int_div_ceil(size, self.multiple) * self.multiple
             pad_size = new_size - size
             pad_size_left = pad_size // 2
             pad_size_right = pad_size - pad_size_left
diff --git a/src/transformers/models/dpt/image_processing_dpt.py b/src/transformers/models/dpt/image_processing_dpt.py
index 1584d23849fd..b15cdca8f59b 100644
--- a/src/transformers/models/dpt/image_processing_dpt.py
+++ b/src/transformers/models/dpt/image_processing_dpt.py
@@ -47,6 +47,7 @@
 from ...utils import (
     TensorType,
     filter_out_non_signature_kwargs,
+    int_div_ceil,
     is_vision_available,
     logging,
     requires_backends,
@@ -97,7 +98,7 @@ def constrain_to_multiple_of(val, multiple, min_val=0, max_val=None):
             x = math.floor(val / multiple) * multiple
 
         if x < min_val:
-            x = math.ceil(val / multiple) * multiple
+            x = int_div_ceil(val, multiple) * multiple
 
         return x
 
@@ -289,7 +290,7 @@ def pad_image(
         """
 
         def _get_pad(size, size_divisor):
-            new_size = math.ceil(size / size_divisor) * size_divisor
+            new_size = int_div_ceil(size, size_divisor) * size_divisor
             pad_size = new_size - size
             pad_size_left = pad_size // 2
             pad_size_right = pad_size - pad_size_left
diff --git a/src/transformers/models/dpt/image_processing_dpt_fast.py b/src/transformers/models/dpt/image_processing_dpt_fast.py
index ccc41d950bce..6250c2309892 100644
--- a/src/transformers/models/dpt/image_processing_dpt_fast.py
+++ b/src/transformers/models/dpt/image_processing_dpt_fast.py
@@ -39,7 +39,7 @@
     is_torch_tensor,
 )
 from ...processing_utils import Unpack
-from ...utils import TensorType, auto_docstring, requires_backends
+from ...utils import TensorType, auto_docstring, int_div_ceil, requires_backends
 from .image_processing_dpt import DPTImageProcessorKwargs
 
 
@@ -60,7 +60,7 @@ def constrain_to_multiple_of(val, multiple, min_val=0, max_val=None):
             x = math.floor(val / multiple) * multiple
 
         if x < min_val:
-            x = math.ceil(val / multiple) * multiple
+            x = int_div_ceil(val, multiple) * multiple
 
         return x
 
@@ -325,7 +325,7 @@ def pad_image(
         height, width = image.shape[-2:]
 
         def _get_pad(size, size_divisor):
-            new_size = math.ceil(size / size_divisor) * size_divisor
+            new_size = int_div_ceil(size, size_divisor) * size_divisor
             pad_size = new_size - size
             pad_size_left = pad_size // 2
             pad_size_right = pad_size - pad_size_left
diff --git a/src/transformers/models/dpt/modular_dpt.py b/src/transformers/models/dpt/modular_dpt.py
index b0e0c8858ec7..e335f6b79e23 100644
--- a/src/transformers/models/dpt/modular_dpt.py
+++ b/src/transformers/models/dpt/modular_dpt.py
@@ -31,6 +31,7 @@
 from ...utils import (
     TensorType,
     auto_docstring,
+    int_div_ceil,
     requires_backends,
 )
 from ..beit.image_processing_beit_fast import BeitImageProcessorFast
@@ -56,7 +57,7 @@ def constrain_to_multiple_of(val, multiple, min_val=0, max_val=None):
             x = math.floor(val / multiple) * multiple
 
         if x < min_val:
-            x = math.ceil(val / multiple) * multiple
+            x = int_div_ceil(val, multiple) * multiple
 
         return x
 
@@ -160,7 +161,7 @@ def pad_image(
         height, width = image.shape[-2:]
 
         def _get_pad(size, size_divisor):
-            new_size = math.ceil(size / size_divisor) * size_divisor
+            new_size = int_div_ceil(size, size_divisor) * size_divisor
             pad_size = new_size - size
             pad_size_left = pad_size // 2
             pad_size_right = pad_size - pad_size_left
diff --git a/src/transformers/models/encodec/configuration_encodec.py b/src/transformers/models/encodec/configuration_encodec.py
index 9c5c0a10093d..b10f201c5c0c 100644
--- a/src/transformers/models/encodec/configuration_encodec.py
+++ b/src/transformers/models/encodec/configuration_encodec.py
@@ -18,7 +18,7 @@
 import numpy as np
 
 from ...configuration_utils import PreTrainedConfig
-from ...utils import auto_docstring, logging
+from ...utils import auto_docstring, int_div_ceil, logging
 
 
 logger = logging.get_logger(__name__)
@@ -169,7 +169,7 @@ def codebook_nbits(self) -> int:
 
     @property
     def frame_rate(self) -> int:
-        return math.ceil(self.sampling_rate / self.hop_length)
+        return int_div_ceil(self.sampling_rate, self.hop_length)
 
     @property
     def num_quantizers(self) -> int:
diff --git a/src/transformers/models/eomt/image_processing_eomt.py b/src/transformers/models/eomt/image_processing_eomt.py
index 9c41b78a6e0d..e62673bc6b04 100644
--- a/src/transformers/models/eomt/image_processing_eomt.py
+++ b/src/transformers/models/eomt/image_processing_eomt.py
@@ -13,8 +13,6 @@
 # limitations under the License.
 """Image processor class for EoMT."""
 
-import math
-
 import numpy as np
 
 from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
@@ -41,6 +39,7 @@
     IMAGENET_DEFAULT_STD,
     TensorType,
     filter_out_non_signature_kwargs,
+    int_div_ceil,
     is_torch_available,
     logging,
 )
@@ -347,7 +346,7 @@ def _split_image(self, image: ImageInput, size: dict, image_index: int) -> tuple
         patch_size = size["shortest_edge"]
 
         longer_side = max(image_size)
-        num_patches = math.ceil(longer_side / patch_size)
+        num_patches = int_div_ceil(longer_side, patch_size)
         total_overlap = num_patches * patch_size - longer_side
         overlap_per_patch = total_overlap / (num_patches - 1) if num_patches > 1 else 0
 
diff --git a/src/transformers/models/eomt/image_processing_eomt_fast.py b/src/transformers/models/eomt/image_processing_eomt_fast.py
index 69e8f1dadf42..cd3d3704f8ce 100644
--- a/src/transformers/models/eomt/image_processing_eomt_fast.py
+++ b/src/transformers/models/eomt/image_processing_eomt_fast.py
@@ -13,7 +13,6 @@
 # limitations under the License.
 """Fast Image processor class for EoMT."""
 
-import math
 from typing import Optional, Union
 
 import numpy as np
@@ -39,6 +38,7 @@
     TensorType,
     auto_docstring,
     filter_out_non_signature_kwargs,
+    int_div_ceil,
 )
 from .image_processing_eomt import (
     EomtImageProcessorKwargs,
@@ -127,7 +127,7 @@ def _split_image(self, images: torch.Tensor, size: dict, image_indices: int) ->
         patch_size = size["shortest_edge"]
 
         longer_side = max(height, width)
-        num_patches = math.ceil(longer_side / patch_size)
+        num_patches = int_div_ceil(longer_side, patch_size)
         total_overlap = num_patches * patch_size - longer_side
         overlap_per_patch = total_overlap / (num_patches - 1) if num_patches > 1 else 0
 
diff --git a/src/transformers/models/falcon_mamba/configuration_falcon_mamba.py b/src/transformers/models/falcon_mamba/configuration_falcon_mamba.py
index cc7583fcbbd8..8aafb7b27e06 100644
--- a/src/transformers/models/falcon_mamba/configuration_falcon_mamba.py
+++ b/src/transformers/models/falcon_mamba/configuration_falcon_mamba.py
@@ -17,10 +17,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import math
-
 from ...configuration_utils import PreTrainedConfig
-from ...utils import auto_docstring
+from ...utils import auto_docstring, int_div_ceil
 
 
 @auto_docstring(checkpoint="tiiuae/falcon-mamba-7b")
@@ -117,7 +115,7 @@ def __init__(
         self.use_conv_bias = use_conv_bias
         self.hidden_act = hidden_act
         self.initializer_range = initializer_range
-        self.time_step_rank = math.ceil(self.hidden_size / 16) if time_step_rank == "auto" else time_step_rank
+        self.time_step_rank = int_div_ceil(self.hidden_size, 16) if time_step_rank == "auto" else time_step_rank
         self.time_step_scale = time_step_scale
         self.time_step_min = time_step_min
         self.time_step_max = time_step_max
diff --git a/src/transformers/models/fuyu/image_processing_fuyu.py b/src/transformers/models/fuyu/image_processing_fuyu.py
index c8024210ed9c..3b0aeb6a9865 100644
--- a/src/transformers/models/fuyu/image_processing_fuyu.py
+++ b/src/transformers/models/fuyu/image_processing_fuyu.py
@@ -13,7 +13,6 @@
 # limitations under the License.
 """Image processor class for Fuyu."""
 
-import math
 
 import numpy as np
 
@@ -40,6 +39,7 @@
 from ...utils import (
     TensorType,
     filter_out_non_signature_kwargs,
+    int_div_ceil,
     is_torch_available,
     is_torch_device,
     is_torch_dtype,
@@ -655,16 +655,8 @@ def preprocess_with_tokenizer_info(
                     image = image_input[batch_index, subseq_index]
                     image_height, image_width = image.shape[1], image.shape[2]
                     if variable_sized:
-                        # The min() is required here due to floating point issues:
-                        # math.ceil(torch.tensor(300).cuda() / 30) == 11
-                        new_h = min(
-                            image_height,
-                            math.ceil(image_unpadded_h[batch_index, subseq_index] / patch_height) * patch_height,
-                        )
-                        new_w = min(
-                            image_width,
-                            math.ceil(image_unpadded_w[batch_index, subseq_index] / patch_width) * patch_width,
-                        )
+                        new_h = int_div_ceil(image_unpadded_h[batch_index, subseq_index], patch_height) * patch_height
+                        new_w = int_div_ceil(image_unpadded_w[batch_index, subseq_index], patch_width) * patch_width
                         image = image[:, :new_h, :new_w]
                         image_height, image_width = new_h, new_w
 
diff --git a/src/transformers/models/fuyu/image_processing_fuyu_fast.py b/src/transformers/models/fuyu/image_processing_fuyu_fast.py
index 633d65dd1b55..f9cb5aa032ed 100644
--- a/src/transformers/models/fuyu/image_processing_fuyu_fast.py
+++ b/src/transformers/models/fuyu/image_processing_fuyu_fast.py
@@ -13,7 +13,6 @@
 # limitations under the License.
 """Fast Image processor class for Fuyu."""
 
-import math
 from typing import Optional
 
 import torch
@@ -32,6 +31,7 @@
 from ...utils import (
     TensorType,
     auto_docstring,
+    int_div_ceil,
     is_torchvision_available,
     logging,
     requires_backends,
@@ -286,15 +286,8 @@ def preprocess_with_tokenizer_info(
                     image_height, image_width = image.shape[1], image.shape[2]
                     if variable_sized:
                         # Calculate new dimensions based on unpadded size
-                        # The min() is required here due to floating point issues
-                        new_h = min(
-                            image_height,
-                            math.ceil(image_unpadded_h[batch_index, subseq_index] / patch_height) * patch_height,
-                        )
-                        new_w = min(
-                            image_width,
-                            math.ceil(image_unpadded_w[batch_index, subseq_index] / patch_width) * patch_width,
-                        )
+                        new_h = int_div_ceil(image_unpadded_h[batch_index, subseq_index], patch_height) * patch_height
+                        new_w = int_div_ceil(image_unpadded_w[batch_index, subseq_index], patch_width) * patch_width
                         image = image[:, :new_h, :new_w]
                         image_height, image_width = new_h, new_w
                     num_patches = self.get_num_patches(
diff --git a/src/transformers/models/gemma3/image_processing_gemma3.py b/src/transformers/models/gemma3/image_processing_gemma3.py
index 8a185eef8cd3..87713b17aab9 100644
--- a/src/transformers/models/gemma3/image_processing_gemma3.py
+++ b/src/transformers/models/gemma3/image_processing_gemma3.py
@@ -39,7 +39,7 @@
     validate_preprocess_arguments,
 )
 from ...processing_utils import ImagesKwargs
-from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging
+from ...utils import TensorType, filter_out_non_signature_kwargs, int_div_ceil, is_vision_available, logging
 
 
 logger = logging.get_logger(__name__)
@@ -206,8 +206,8 @@ def pan_and_scan(
             num_crops_h = min(pan_and_scan_max_num_crops, num_crops_h)
             num_crops_w = 1
 
-        crop_size_w = int(math.ceil(width / num_crops_w))
-        crop_size_h = int(math.ceil(height / num_crops_h))
+        crop_size_w = int_div_ceil(width, num_crops_w)
+        crop_size_h = int_div_ceil(height, num_crops_h)
 
         # Don't apply PaS if crop size is too small.
         if min(crop_size_w, crop_size_h) < pan_and_scan_min_crop_size:
diff --git a/src/transformers/models/gemma3/image_processing_gemma3_fast.py b/src/transformers/models/gemma3/image_processing_gemma3_fast.py
index ca40bc945ceb..e674fdfb3589 100644
--- a/src/transformers/models/gemma3/image_processing_gemma3_fast.py
+++ b/src/transformers/models/gemma3/image_processing_gemma3_fast.py
@@ -31,6 +31,7 @@
 from ...utils import (
     TensorType,
     auto_docstring,
+    int_div_ceil,
     logging,
 )
 from .image_processing_gemma3 import Gemma3ImageProcessorKwargs
@@ -112,8 +113,8 @@ def pan_and_scan_batched(
             num_crops_h = min(pan_and_scan_max_num_crops, num_crops_h)
             num_crops_w = 1
 
-        crop_size_w = int(math.ceil(width / num_crops_w))
-        crop_size_h = int(math.ceil(height / num_crops_h))
+        crop_size_w = int_div_ceil(width, num_crops_w)
+        crop_size_h = int_div_ceil(height, num_crops_h)
 
         # Don't apply PaS if crop size is too small.
         if min(crop_size_w, crop_size_h) < pan_and_scan_min_crop_size:
diff --git a/src/transformers/models/granite_speech/feature_extraction_granite_speech.py b/src/transformers/models/granite_speech/feature_extraction_granite_speech.py
index cd32d0433bae..9c239ba57727 100644
--- a/src/transformers/models/granite_speech/feature_extraction_granite_speech.py
+++ b/src/transformers/models/granite_speech/feature_extraction_granite_speech.py
@@ -13,14 +13,13 @@
 # limitations under the License.
 """Feature extractor class for Granite Speech."""
 
-import math
 from collections.abc import Sequence
 
 import numpy as np
 
 from ...feature_extraction_utils import BatchFeature, FeatureExtractionMixin
 from ...tokenization_utils_base import AudioInput
-from ...utils import is_torch_available, is_torchaudio_available, logging
+from ...utils import int_div_ceil, is_torch_available, is_torchaudio_available, logging
 from ...utils.import_utils import requires_backends
 
 
@@ -135,7 +134,7 @@ def _get_num_audio_features(self, audio_lengths: Sequence[int]) -> Sequence[int]
             mel_length = raw_length // hop_length + 1
             # encoder frame takes two mel features
             encoder_length = mel_length // 2
-            nblocks = math.ceil(encoder_length / self.projector_window_size)
+            nblocks = int_div_ceil(encoder_length, self.projector_window_size)
             # projector output length
             projector_length = nblocks * effective_window_size
             projector_lengths.append(projector_length)
diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py
index 43359ec98b7e..0ae7ec6c7220 100644
--- a/src/transformers/models/granite_speech/modeling_granite_speech.py
+++ b/src/transformers/models/granite_speech/modeling_granite_speech.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import math
 from dataclasses import dataclass
 
 import torch
@@ -29,6 +28,7 @@
     TransformersKwargs,
     auto_docstring,
     can_return_tuple,
+    int_div_ceil,
     is_peft_available,
     logging,
     torch_compilable_check,
@@ -86,7 +86,7 @@ def __init__(self, config: GraniteSpeechConfig):
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         batch_size, seq_len, dim = hidden_states.size()
-        nblocks = math.ceil(seq_len / self.window_size)
+        nblocks = int_div_ceil(seq_len, self.window_size)
         pad = nblocks * self.window_size - seq_len
         hidden_states = nn.functional.pad(hidden_states, (0, 0, 0, pad), "constant", 0)
         hidden_states = hidden_states.view(batch_size * nblocks, self.window_size, dim)
@@ -152,7 +152,7 @@ def forward(self, hidden_states: torch.Tensor, attention_dists: torch.Tensor) ->
         hidden_states = self.pre_norm(hidden_states)
         bsz, num_features, _ = hidden_states.shape
 
-        num_blocks = math.ceil(num_features / self.context_size)
+        num_blocks = int_div_ceil(num_features, self.context_size)
         remainder = num_features % self.context_size
         if remainder > 0:
             # right padding to reach block size
diff --git a/src/transformers/models/higgs_audio_v2_tokenizer/configuration_higgs_audio_v2_tokenizer.py b/src/transformers/models/higgs_audio_v2_tokenizer/configuration_higgs_audio_v2_tokenizer.py
index 423241c83306..9dcc617d50f9 100644
--- a/src/transformers/models/higgs_audio_v2_tokenizer/configuration_higgs_audio_v2_tokenizer.py
+++ b/src/transformers/models/higgs_audio_v2_tokenizer/configuration_higgs_audio_v2_tokenizer.py
@@ -24,7 +24,7 @@
 import numpy as np
 
 from ...configuration_utils import PreTrainedConfig
-from ...utils import auto_docstring
+from ...utils import auto_docstring, int_div_ceil
 from ..auto import CONFIG_MAPPING, AutoConfig
 
 
@@ -147,7 +147,7 @@ def __init__(
 
     @property
     def frame_rate(self) -> int:
-        return math.ceil(self.sample_rate / self.hop_length)
+        return int_div_ceil(self.sample_rate, self.hop_length)
 
     @property
     def semantic_hidden_size(self) -> int:
diff --git a/src/transformers/models/idefics3/image_processing_idefics3.py b/src/transformers/models/idefics3/image_processing_idefics3.py
index 8539fcbef903..4a59b2eb23b1 100644
--- a/src/transformers/models/idefics3/image_processing_idefics3.py
+++ b/src/transformers/models/idefics3/image_processing_idefics3.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import math
 from collections.abc import Iterable
 from typing import Any
 
@@ -35,7 +34,7 @@
     validate_preprocess_arguments,
 )
 from ...processing_utils import ImagesKwargs
-from ...utils import TensorType, is_vision_available, logging
+from ...utils import TensorType, int_div_ceil, is_vision_available, logging
 
 
 logger = logging.get_logger(__name__)
@@ -431,11 +430,11 @@ def split_image(
         frames = []
         if height > max_height or width > max_width:
             # Calculate the number of splits
-            num_splits_h = math.ceil(height / max_height)
-            num_splits_w = math.ceil(width / max_width)
+            num_splits_h = int_div_ceil(height, max_height)
+            num_splits_w = int_div_ceil(width, max_width)
             # Calculate the optimal width and height for the sub-images
-            optimal_height = math.ceil(height / num_splits_h)
-            optimal_width = math.ceil(width / num_splits_w)
+            optimal_height = int_div_ceil(height, num_splits_h)
+            optimal_width = int_div_ceil(width, num_splits_w)
 
             # Iterate through each row and column
             for r in range(num_splits_h):
@@ -502,13 +501,13 @@ def resize_for_vision_encoder(
 
         aspect_ratio = width / height
         if width >= height:
-            width = math.ceil(width / vision_encoder_max_size) * vision_encoder_max_size
+            width = int_div_ceil(width, vision_encoder_max_size) * vision_encoder_max_size
             height = int(width / aspect_ratio)
-            height = math.ceil(height / vision_encoder_max_size) * vision_encoder_max_size
+            height = int_div_ceil(height, vision_encoder_max_size) * vision_encoder_max_size
         elif height > width:
-            height = math.ceil(height / vision_encoder_max_size) * vision_encoder_max_size
+            height = int_div_ceil(height, vision_encoder_max_size) * vision_encoder_max_size
             width = int(height * aspect_ratio)
-            width = math.ceil(width / vision_encoder_max_size) * vision_encoder_max_size
+            width = int_div_ceil(width, vision_encoder_max_size) * vision_encoder_max_size
         new_size = {"height": height, "width": width}
         return self.resize(
             image, size=new_size, resample=resample, input_data_format=input_data_format, data_format=data_format
@@ -893,19 +892,19 @@ def get_number_of_image_patches(self, height: int, width: int, images_kwargs: di
             aspect_ratio = width / height
 
             if width >= height:
-                resized_width = math.ceil(width / max_image_size["longest_edge"]) * max_image_size["longest_edge"]
+                resized_width = int_div_ceil(width, max_image_size["longest_edge"]) * max_image_size["longest_edge"]
                 resized_height = int(width / aspect_ratio)
-                resized_height = math.ceil(height / max_image_size["longest_edge"]) * max_image_size["longest_edge"]
+                resized_height = int_div_ceil(height, max_image_size["longest_edge"]) * max_image_size["longest_edge"]
             elif height > width:
-                resized_height = math.ceil(height / max_image_size["longest_edge"]) * max_image_size["longest_edge"]
+                resized_height = int_div_ceil(height, max_image_size["longest_edge"]) * max_image_size["longest_edge"]
                 resized_width = int(height * aspect_ratio)
-                resized_width = math.ceil(width / max_image_size["longest_edge"]) * max_image_size["longest_edge"]
+                resized_width = int_div_ceil(width, max_image_size["longest_edge"]) * max_image_size["longest_edge"]
 
             max_height = max_width = max_image_size["longest_edge"]
             if resized_height > max_height or resized_width > max_width:
                 # Calculate the number of splits
-                num_rows = math.ceil(resized_height / max_height)
-                num_cols = math.ceil(resized_width / max_width)
+                num_rows = int_div_ceil(resized_height, max_height)
+                num_cols = int_div_ceil(resized_width, max_width)
                 num_patches = num_rows * num_cols + 1
 
         return num_patches, num_rows, num_cols
diff --git a/src/transformers/models/idefics3/image_processing_idefics3_fast.py b/src/transformers/models/idefics3/image_processing_idefics3_fast.py
index f2795ebfd64d..2f22d6ac7fdb 100644
--- a/src/transformers/models/idefics3/image_processing_idefics3_fast.py
+++ b/src/transformers/models/idefics3/image_processing_idefics3_fast.py
@@ -12,8 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-
-import math
 from typing import Optional
 
 import torch
@@ -33,7 +31,7 @@
     make_nested_list_of_images,
 )
 from ...processing_utils import Unpack
-from ...utils import TensorType, auto_docstring, is_torchvision_available, logging
+from ...utils import TensorType, auto_docstring, int_div_ceil, is_torchvision_available, logging
 from .image_processing_idefics3 import Idefics3ImageProcessorKwargs
 
 
@@ -284,8 +282,8 @@ def split_images(
         frames = []
         if height > max_height or width > max_width:
             # Calculate the number of splits
-            num_splits_h = math.ceil(height / max_height)
-            num_splits_w = math.ceil(width / max_width)
+            num_splits_h = int_div_ceil(height, max_height)
+            num_splits_w = int_div_ceil(width, max_width)
 
             # Split the images by height, then by width
             frames = (
@@ -333,13 +331,13 @@ def resize_for_vision_encoder(
 
         aspect_ratio = width / height
         if width >= height:
-            width = math.ceil(width / vision_encoder_max_size) * vision_encoder_max_size
+            width = int_div_ceil(width, vision_encoder_max_size) * vision_encoder_max_size
             height = int(width / aspect_ratio)
-            height = math.ceil(height / vision_encoder_max_size) * vision_encoder_max_size
+            height = int_div_ceil(height, vision_encoder_max_size) * vision_encoder_max_size
         elif height > width:
-            height = math.ceil(height / vision_encoder_max_size) * vision_encoder_max_size
+            height = int_div_ceil(height, vision_encoder_max_size) * vision_encoder_max_size
             width = int(height * aspect_ratio)
-            width = math.ceil(width / vision_encoder_max_size) * vision_encoder_max_size
+            width = int_div_ceil(width, vision_encoder_max_size) * vision_encoder_max_size
         new_size = SizeDict(height=height, width=width)
         return self.resize(image, size=new_size, interpolation=interpolation)
 
@@ -530,19 +528,19 @@ def get_number_of_image_patches(self, height: int, width: int, images_kwargs: di
             aspect_ratio = width / height
 
             if width >= height:
-                resized_width = math.ceil(width / max_image_size["longest_edge"]) * max_image_size["longest_edge"]
+                resized_width = int_div_ceil(width, max_image_size["longest_edge"]) * max_image_size["longest_edge"]
                 resized_height = int(width / aspect_ratio)
-                resized_height = math.ceil(height / max_image_size["longest_edge"]) * max_image_size["longest_edge"]
+                resized_height = int_div_ceil(height, max_image_size["longest_edge"]) * max_image_size["longest_edge"]
             elif height > width:
-                resized_height = math.ceil(height / max_image_size["longest_edge"]) * max_image_size["longest_edge"]
+                resized_height = int_div_ceil(height, max_image_size["longest_edge"]) * max_image_size["longest_edge"]
                 resized_width = int(height * aspect_ratio)
-                resized_width = math.ceil(width / max_image_size["longest_edge"]) * max_image_size["longest_edge"]
+                resized_width = int_div_ceil(width, max_image_size["longest_edge"]) * max_image_size["longest_edge"]
 
             max_height = max_width = max_image_size["longest_edge"]
             if resized_height > max_height or resized_width > max_width:
                 # Calculate the number of splits
-                num_rows = math.ceil(resized_height / max_height)
-                num_cols = math.ceil(resized_width / max_width)
+                num_rows = int_div_ceil(resized_height, max_height)
+                num_cols = int_div_ceil(resized_width, max_width)
                 num_patches = num_rows * num_cols + 1
 
         return num_patches, num_rows, num_cols
diff --git a/src/transformers/models/jamba/configuration_jamba.py b/src/transformers/models/jamba/configuration_jamba.py
index 2ea28bfed57a..a97f92ff4bdb 100644
--- a/src/transformers/models/jamba/configuration_jamba.py
+++ b/src/transformers/models/jamba/configuration_jamba.py
@@ -13,10 +13,9 @@
 # limitations under the License.
 """Jamba model configuration"""
 
-import math
 
 from ...configuration_utils import PreTrainedConfig
-from ...utils import auto_docstring, logging
+from ...utils import auto_docstring, int_div_ceil, logging
 
 
 logger = logging.get_logger(__name__)
@@ -118,7 +117,7 @@ def __init__(
         self.mamba_d_state = mamba_d_state
         self.mamba_d_conv = mamba_d_conv
         self.mamba_expand = mamba_expand
-        self.mamba_dt_rank = math.ceil(self.hidden_size / 16) if mamba_dt_rank == "auto" else mamba_dt_rank
+        self.mamba_dt_rank = int_div_ceil(self.hidden_size, 16) if mamba_dt_rank == "auto" else mamba_dt_rank
         self.mamba_conv_bias = mamba_conv_bias
         self.mamba_proj_bias = mamba_proj_bias
 
diff --git a/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py b/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py
index 5b8811d615ce..1e1e2cf534a7 100755
--- a/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py
+++ b/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py
@@ -32,7 +32,7 @@
 from ...modeling_utils import PreTrainedModel
 from ...processing_utils import Unpack
 from ...pytorch_utils import apply_chunking_to_forward
-from ...utils import auto_docstring, is_detectron2_available, logging, requires_backends
+from ...utils import auto_docstring, int_div_ceil, is_detectron2_available, logging, requires_backends
 from ...utils.generic import TransformersKwargs, can_return_tuple, merge_with_config_defaults
 from ...utils.output_capturing import capture_outputs
 from .configuration_layoutlmv2 import LayoutLMv2Config
@@ -500,8 +500,8 @@ def __init__(self, config):
             backbone_stride = self.backbone.output_shape()[self.out_feature_key].stride
             self.pool = nn.AvgPool2d(
                 (
-                    math.ceil(math.ceil(input_shape[0] / backbone_stride) / config.image_feature_pool_shape[0]),
-                    math.ceil(math.ceil(input_shape[1] / backbone_stride) / config.image_feature_pool_shape[1]),
+                    int_div_ceil(int_div_ceil(input_shape[0], backbone_stride), config.image_feature_pool_shape[0]),
+                    int_div_ceil(int_div_ceil(input_shape[1], backbone_stride), config.image_feature_pool_shape[1]),
                 )
             )
         else:
diff --git a/src/transformers/models/lfm2_vl/processing_lfm2_vl.py b/src/transformers/models/lfm2_vl/processing_lfm2_vl.py
index bf654310d0d3..5322270b288c 100755
--- a/src/transformers/models/lfm2_vl/processing_lfm2_vl.py
+++ b/src/transformers/models/lfm2_vl/processing_lfm2_vl.py
@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import math
 
 from ...feature_extraction_utils import BatchFeature
 from ...image_utils import ImageInput, make_nested_list_of_images
@@ -22,7 +21,7 @@
     Unpack,
 )
 from ...tokenization_utils_base import BatchEncoding, TextInput
-from ...utils import auto_docstring, logging
+from ...utils import auto_docstring, int_div_ceil, logging
 
 
 logger = logging.get_logger(__name__)
@@ -215,14 +214,14 @@ def _build_image_tokens(
     def _compute_tokens_per_tile(self, tile_size: int, encoder_patch_size: int, downsample_factor: int) -> int:
         """Compute the number of tokens for a single tile."""
         num_patches = tile_size // encoder_patch_size
-        downsampled_patches = math.ceil(num_patches / downsample_factor)
+        downsampled_patches = int_div_ceil(num_patches, downsample_factor)
         return downsampled_patches * downsampled_patches
 
     def _compute_tokens_for_image(self, image_size: list[int], encoder_patch_size: int, downsample_factor: int) -> int:
         """Compute the number of tokens for a resized image (used for single-tile or thumbnail)."""
         image_height, image_width = image_size
-        patches_h = math.ceil((image_height // encoder_patch_size) / downsample_factor)
-        patches_w = math.ceil((image_width // encoder_patch_size) / downsample_factor)
+        patches_h = int_div_ceil(image_height // encoder_patch_size, downsample_factor)
+        patches_w = int_div_ceil(image_width // encoder_patch_size, downsample_factor)
         return patches_h * patches_w
 
     def _get_image_num_tokens(self, image_size: list[int], **images_kwargs) -> tuple[int, int]:
diff --git a/src/transformers/models/llava_onevision/modeling_llava_onevision.py b/src/transformers/models/llava_onevision/modeling_llava_onevision.py
index b01d3d00edf2..7e4b168c3253 100644
--- a/src/transformers/models/llava_onevision/modeling_llava_onevision.py
+++ b/src/transformers/models/llava_onevision/modeling_llava_onevision.py
@@ -34,7 +34,7 @@
 from ...modeling_outputs import BaseModelOutputWithPast, BaseModelOutputWithPooling, ModelOutput
 from ...modeling_utils import PreTrainedModel
 from ...processing_utils import Unpack
-from ...utils import TransformersKwargs, auto_docstring, torch_compilable_check
+from ...utils import TransformersKwargs, auto_docstring, int_div_ceil, torch_compilable_check
 from ...utils.generic import can_return_tuple, merge_with_config_defaults
 from ..auto import AutoModel
 from .configuration_llava_onevision import LlavaOnevisionConfig
@@ -621,7 +621,7 @@ def apply_pooling(self, image_features):
         image_features = image_features.permute(0, 3, 1, 2).contiguous()
 
         height, width = image_features.shape[2:]
-        scaled_shape = [math.ceil(height / 2), math.ceil(width / 2)]
+        scaled_shape = [int_div_ceil(height, 2), int_div_ceil(width, 2)]
         image_features = nn.functional.interpolate(image_features, size=scaled_shape, mode="bilinear")
 
         image_features = image_features.permute(0, 2, 3, 1)
diff --git a/src/transformers/models/llava_onevision/modular_llava_onevision.py b/src/transformers/models/llava_onevision/modular_llava_onevision.py
index cb535b276ca2..d4874f65dd60 100644
--- a/src/transformers/models/llava_onevision/modular_llava_onevision.py
+++ b/src/transformers/models/llava_onevision/modular_llava_onevision.py
@@ -47,7 +47,7 @@
 from ...modeling_flash_attention_utils import FlashAttentionKwargs
 from ...modeling_outputs import BaseModelOutputWithPooling
 from ...processing_utils import Unpack
-from ...utils import TensorType, auto_docstring, logging
+from ...utils import TensorType, auto_docstring, int_div_ceil, logging
 from ...utils.generic import can_return_tuple, merge_with_config_defaults
 from .image_processing_llava_onevision import LlavaOnevisionImageProcessorKwargs
 
@@ -303,7 +303,7 @@ def apply_pooling(self, image_features):
         image_features = image_features.permute(0, 3, 1, 2).contiguous()
 
         height, width = image_features.shape[2:]
-        scaled_shape = [math.ceil(height / 2), math.ceil(width / 2)]
+        scaled_shape = [int_div_ceil(height, 2), int_div_ceil(width, 2)]
         image_features = nn.functional.interpolate(image_features, size=scaled_shape, mode="bilinear")
 
         image_features = image_features.permute(0, 2, 3, 1)
diff --git a/src/transformers/models/llava_onevision/processing_llava_onevision.py b/src/transformers/models/llava_onevision/processing_llava_onevision.py
index 3bd407123864..f4670167d077 100644
--- a/src/transformers/models/llava_onevision/processing_llava_onevision.py
+++ b/src/transformers/models/llava_onevision/processing_llava_onevision.py
@@ -25,7 +25,7 @@
 from ...image_utils import ImageInput, get_image_size, to_numpy_array
 from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
 from ...tokenization_utils_base import PreTokenizedInput, TextInput
-from ...utils import auto_docstring, logging
+from ...utils import auto_docstring, int_div_ceil, logging
 from ...video_utils import VideoInput
 
 
@@ -146,7 +146,7 @@ def __call__(
             height, width = get_image_size(one_video[0], channel_dim=output_kwargs["images_kwargs"].get("data_format"))
             num_frames = one_video.shape[0]  # frame dim is always after batch dim
             patches_height_width = int(math.sqrt(self.num_image_tokens))
-            pooled_height_width = math.ceil(patches_height_width / 2)
+            pooled_height_width = int_div_ceil(patches_height_width, 2)
             num_video_tokens = (num_frames * pooled_height_width * pooled_height_width) + 1  # +1 for newline token
             text = [sample.replace(self.video_token, self.video_token * num_video_tokens) for sample in text]
 
diff --git a/src/transformers/models/mamba/configuration_mamba.py b/src/transformers/models/mamba/configuration_mamba.py
index 68533fe90c0b..c3997924b5fd 100644
--- a/src/transformers/models/mamba/configuration_mamba.py
+++ b/src/transformers/models/mamba/configuration_mamba.py
@@ -13,10 +13,9 @@
 # limitations under the License.
 """MAMBA configuration"""
 
-import math
 
 from ...configuration_utils import PreTrainedConfig
-from ...utils import auto_docstring, logging
+from ...utils import auto_docstring, int_div_ceil, logging
 
 
 logger = logging.get_logger(__name__)
@@ -109,7 +108,7 @@ def __init__(
         self.use_conv_bias = use_conv_bias
         self.hidden_act = hidden_act
         self.initializer_range = initializer_range
-        self.time_step_rank = math.ceil(self.hidden_size / 16) if time_step_rank == "auto" else time_step_rank
+        self.time_step_rank = int_div_ceil(self.hidden_size, 16) if time_step_rank == "auto" else time_step_rank
         self.time_step_scale = time_step_scale
         self.time_step_min = time_step_min
         self.time_step_max = time_step_max
diff --git a/src/transformers/models/mamba/convert_mamba_ssm_checkpoint_to_pytorch.py b/src/transformers/models/mamba/convert_mamba_ssm_checkpoint_to_pytorch.py
index 96dfbbc1d4a7..3f4461b29eef 100644
--- a/src/transformers/models/mamba/convert_mamba_ssm_checkpoint_to_pytorch.py
+++ b/src/transformers/models/mamba/convert_mamba_ssm_checkpoint_to_pytorch.py
@@ -15,12 +15,11 @@
 
 import argparse
 import json
-import math
 
 import torch
 
 from transformers import AutoTokenizer, MambaConfig, MambaForCausalLM
-from transformers.utils import logging
+from transformers.utils import int_div_ceil, logging
 from transformers.utils.import_utils import is_mamba_ssm_available
 
 
@@ -34,7 +33,7 @@ def convert_ssm_config_to_hf_config(config_ssm: MambaConfigSSM) -> MambaConfig:
         # Set config hidden size, num hidden layers, and vocab size directly from the original config
         hf_config.hidden_size = config_ssm.d_model
         hf_config.intermediate_size = config_ssm.d_model * 2
-        hf_config.time_step_rank = math.ceil(config_ssm.d_model / 16)
+        hf_config.time_step_rank = int_div_ceil(config_ssm.d_model, 16)
 
         hf_config.num_hidden_layers = config_ssm.n_layer
         vocab_size = config_ssm.vocab_size
diff --git a/src/transformers/models/mamba2/configuration_mamba2.py b/src/transformers/models/mamba2/configuration_mamba2.py
index 575d61520393..96bfe4f23f7b 100644
--- a/src/transformers/models/mamba2/configuration_mamba2.py
+++ b/src/transformers/models/mamba2/configuration_mamba2.py
@@ -13,10 +13,9 @@
 # limitations under the License.
 """MAMBA2 configuration"""
 
-import math
 
 from ...configuration_utils import PreTrainedConfig
-from ...utils import auto_docstring, logging
+from ...utils import auto_docstring, int_div_ceil, logging
 
 
 logger = logging.get_logger(__name__)
@@ -113,7 +112,7 @@ def __init__(
         self.use_conv_bias = use_conv_bias
         self.hidden_act = hidden_act
         self.initializer_range = initializer_range
-        self.time_step_rank = math.ceil(self.hidden_size / 16) if time_step_rank == "auto" else time_step_rank
+        self.time_step_rank = int_div_ceil(self.hidden_size, 16) if time_step_rank == "auto" else time_step_rank
         self.time_step_min = time_step_min
         self.time_step_max = time_step_max
         self.time_step_floor = time_step_floor
diff --git a/src/transformers/models/mask2former/image_processing_mask2former.py b/src/transformers/models/mask2former/image_processing_mask2former.py
index b22bddf8d044..0a15a55e6ee4 100644
--- a/src/transformers/models/mask2former/image_processing_mask2former.py
+++ b/src/transformers/models/mask2former/image_processing_mask2former.py
@@ -13,7 +13,6 @@
 # limitations under the License.
 """Image processor class for Mask2Former."""
 
-import math
 from collections.abc import Iterable
 from typing import Any
 
@@ -46,6 +45,7 @@
     IMAGENET_DEFAULT_STD,
     TensorType,
     filter_out_non_signature_kwargs,
+    int_div_ceil,
     is_torch_available,
     is_torch_tensor,
     logging,
@@ -360,8 +360,8 @@ def get_mask2former_resize_output_image_size(
 
     if size_divisor > 0:
         height, width = output_size
-        height = int(math.ceil(height / size_divisor) * size_divisor)
-        width = int(math.ceil(width / size_divisor) * size_divisor)
+        height = int_div_ceil(height, size_divisor) * size_divisor
+        width = int_div_ceil(width, size_divisor) * size_divisor
         output_size = (height, width)
 
     return output_size
diff --git a/src/transformers/models/mask2former/image_processing_mask2former_fast.py b/src/transformers/models/mask2former/image_processing_mask2former_fast.py
index 194b2aff9ea3..966ca32b5fd2 100644
--- a/src/transformers/models/mask2former/image_processing_mask2former_fast.py
+++ b/src/transformers/models/mask2former/image_processing_mask2former_fast.py
@@ -18,7 +18,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import math
 from typing import Any, Optional, Union
 
 import torch
@@ -44,7 +43,7 @@
     PILImageResampling,
 )
 from ...processing_utils import Unpack
-from ...utils import TensorType, auto_docstring, logging
+from ...utils import TensorType, auto_docstring, int_div_ceil, logging
 from .image_processing_mask2former import (
     Mask2FormerImageProcessorKwargs,
     compute_segments,
@@ -189,8 +188,8 @@ def resize(
             )
         if size_divisor > 0:
             height, width = new_size
-            height = int(math.ceil(height / size_divisor) * size_divisor)
-            width = int(math.ceil(width / size_divisor) * size_divisor)
+            height = int_div_ceil(height, size_divisor) * size_divisor
+            width = int_div_ceil(width, size_divisor) * size_divisor
             new_size = (height, width)
 
         image = tvF.resize(
diff --git a/src/transformers/models/maskformer/image_processing_maskformer.py b/src/transformers/models/maskformer/image_processing_maskformer.py
index 44f5aa24e96e..4cf0ea051116 100644
--- a/src/transformers/models/maskformer/image_processing_maskformer.py
+++ b/src/transformers/models/maskformer/image_processing_maskformer.py
@@ -13,7 +13,6 @@
 # limitations under the License.
 """Image processor class for MaskFormer."""
 
-import math
 from collections.abc import Iterable
 from typing import Any
 
@@ -46,6 +45,7 @@
     IMAGENET_DEFAULT_STD,
     TensorType,
     filter_out_non_signature_kwargs,
+    int_div_ceil,
     is_torch_available,
     is_torch_tensor,
     logging,
@@ -359,8 +359,8 @@ def get_maskformer_resize_output_image_size(
 
     if size_divisor > 0:
         height, width = output_size
-        height = int(math.ceil(height / size_divisor) * size_divisor)
-        width = int(math.ceil(width / size_divisor) * size_divisor)
+        height = int_div_ceil(height, size_divisor) * size_divisor
+        width = int_div_ceil(width, size_divisor) * size_divisor
         output_size = (height, width)
 
     return output_size
diff --git a/src/transformers/models/maskformer/image_processing_maskformer_fast.py b/src/transformers/models/maskformer/image_processing_maskformer_fast.py
index 33abc9b8f38b..9b62b8301d6a 100644
--- a/src/transformers/models/maskformer/image_processing_maskformer_fast.py
+++ b/src/transformers/models/maskformer/image_processing_maskformer_fast.py
@@ -13,7 +13,6 @@
 # limitations under the License.
 """Fast Image processor class for MaskFormer."""
 
-import math
 from typing import TYPE_CHECKING, Any, Optional, Union
 
 import torch
@@ -42,6 +41,7 @@
 from ...utils import (
     TensorType,
     auto_docstring,
+    int_div_ceil,
     logging,
 )
 from .image_processing_maskformer import (
@@ -192,8 +192,8 @@ def resize(
             )
         if size_divisor > 0:
             height, width = new_size
-            height = int(math.ceil(height / size_divisor) * size_divisor)
-            width = int(math.ceil(width / size_divisor) * size_divisor)
+            height = int_div_ceil(height, size_divisor) * size_divisor
+            width = int_div_ceil(width, size_divisor) * size_divisor
             new_size = (height, width)
 
         image = tvF.resize(
diff --git a/src/transformers/models/metaclip_2/convert_metaclip_2_to_hf.py b/src/transformers/models/metaclip_2/convert_metaclip_2_to_hf.py
index ae3a682fdb58..6db36199dca3 100644
--- a/src/transformers/models/metaclip_2/convert_metaclip_2_to_hf.py
+++ b/src/transformers/models/metaclip_2/convert_metaclip_2_to_hf.py
@@ -25,7 +25,6 @@
 
 # Import MetaCLIP modules
 from src.mini_clip.factory import create_model_and_transforms
-
 from transformers import (
     AutoTokenizer,
     CLIPImageProcessor,
diff --git a/src/transformers/models/mimi/configuration_mimi.py b/src/transformers/models/mimi/configuration_mimi.py
index 8614bb5da840..ecb60eab0f77 100644
--- a/src/transformers/models/mimi/configuration_mimi.py
+++ b/src/transformers/models/mimi/configuration_mimi.py
@@ -19,7 +19,7 @@
 
 from ...configuration_utils import PreTrainedConfig
 from ...modeling_rope_utils import RopeParameters
-from ...utils import auto_docstring, logging
+from ...utils import auto_docstring, int_div_ceil, logging
 
 
 logger = logging.get_logger(__name__)
@@ -185,7 +185,7 @@ def __init__(
     @property
     def encodec_frame_rate(self) -> int:
         hop_length = np.prod(self.upsampling_ratios)
-        return math.ceil(self.sampling_rate / hop_length)
+        return int_div_ceil(self.sampling_rate, int(hop_length))
 
     @property
     def num_codebooks(self) -> int:
diff --git a/src/transformers/models/mllama/convert_mllama_weights_to_hf.py b/src/transformers/models/mllama/convert_mllama_weights_to_hf.py
index 857987f65ac9..970b5bdd38d0 100644
--- a/src/transformers/models/mllama/convert_mllama_weights_to_hf.py
+++ b/src/transformers/models/mllama/convert_mllama_weights_to_hf.py
@@ -15,7 +15,6 @@
 import argparse
 import gc
 import json
-import math
 import os
 
 import regex as re
@@ -32,6 +31,7 @@
 from transformers.convert_slow_tokenizer import TikTokenConverter
 from transformers.models.mllama.configuration_mllama import MllamaTextConfig, MllamaVisionConfig
 from transformers.models.mllama.image_processing_mllama import get_all_supported_aspect_ratios
+from transformers.utils import int_div_ceil
 
 
 # fmt: off
@@ -257,7 +257,7 @@ def write_model(
         text_key_value_dim = text_dim
 
     # cross-attention layers: 20 for 90B, 8 for 11B
-    cross_attention_frequency = math.ceil(text_num_layers / cross_attention_num_layers)
+    cross_attention_frequency = int_div_ceil(text_num_layers, cross_attention_num_layers)
     text_num_total_layers = text_num_layers + cross_attention_num_layers
     cross_attention_layers_shift = list(
         range(cross_attention_frequency - 1, text_num_total_layers, cross_attention_frequency + 1)
diff --git a/src/transformers/models/mobilevit/modeling_mobilevit.py b/src/transformers/models/mobilevit/modeling_mobilevit.py
index bdb4953da3e0..d5c94c86d5b6 100755
--- a/src/transformers/models/mobilevit/modeling_mobilevit.py
+++ b/src/transformers/models/mobilevit/modeling_mobilevit.py
@@ -31,7 +31,7 @@
     SemanticSegmenterOutput,
 )
 from ...modeling_utils import PreTrainedModel
-from ...utils import auto_docstring, logging, torch_int
+from ...utils import auto_docstring, int_div_ceil, logging, torch_int
 from .configuration_mobilevit import MobileViTConfig
 
 
@@ -405,12 +405,12 @@ def unfolding(self, features: torch.Tensor) -> tuple[torch.Tensor, dict]:
         new_height = (
             torch_int(torch.ceil(orig_height / patch_height) * patch_height)
             if torch.jit.is_tracing()
-            else int(math.ceil(orig_height / patch_height) * patch_height)
+            else int_div_ceil(orig_height, patch_height) * patch_height
         )
         new_width = (
             torch_int(torch.ceil(orig_width / patch_width) * patch_width)
             if torch.jit.is_tracing()
-            else int(math.ceil(orig_width / patch_width) * patch_width)
+            else int_div_ceil(orig_width, patch_width) * patch_width
         )
 
         interpolate = False
diff --git a/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py b/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py
index 084d0ac93a0e..d0fe13b57635 100644
--- a/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py
+++ b/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py
@@ -43,7 +43,7 @@
 from ...modeling_outputs import BaseModelOutputWithPast, ModelOutput
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
-from ...utils import TransformersKwargs, auto_docstring, logging
+from ...utils import TransformersKwargs, auto_docstring, int_div_ceil, logging
 from ...utils.generic import merge_with_config_defaults
 from ...utils.output_capturing import OutputRecorder, capture_outputs
 from ..auto.configuration_auto import AutoConfig
@@ -1539,7 +1539,7 @@ def forward(
 
                 # pad or truncate to config.chroma_length
                 if audio_hidden_states.shape[1] < self.config.chroma_length:
-                    n_repeat = int(math.ceil(self.config.chroma_length / audio_hidden_states.shape[1]))
+                    n_repeat = int_div_ceil(self.config.chroma_length, audio_hidden_states.shape[1])
                     audio_hidden_states = audio_hidden_states.repeat(1, n_repeat, 1)
                 else:
                     logger.warning(
@@ -1771,7 +1771,7 @@ def _prepare_encoder_hidden_states_kwargs_for_generation(
 
             # pad or truncate to config.chroma_length
             if audio_hidden_states.shape[1] < self.config.chroma_length:
-                n_repeat = int(math.ceil(self.config.chroma_length / audio_hidden_states.shape[1]))
+                n_repeat = int_div_ceil(self.config.chroma_length, audio_hidden_states.shape[1])
                 audio_hidden_states = audio_hidden_states.repeat(1, n_repeat, 1)
             audio_hidden_states = audio_hidden_states[:, : self.config.chroma_length]
 
diff --git a/src/transformers/models/nllb_moe/modeling_nllb_moe.py b/src/transformers/models/nllb_moe/modeling_nllb_moe.py
index 313b574518f4..548f5eef41b1 100644
--- a/src/transformers/models/nllb_moe/modeling_nllb_moe.py
+++ b/src/transformers/models/nllb_moe/modeling_nllb_moe.py
@@ -37,7 +37,7 @@
 )
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
-from ...utils import TransformersKwargs, auto_docstring, logging
+from ...utils import TransformersKwargs, auto_docstring, int_div_ceil, logging
 from ...utils.generic import can_return_tuple, merge_with_config_defaults
 from ...utils.output_capturing import OutputRecorder, capture_outputs
 from .configuration_nllb_moe import NllbMoeConfig
@@ -269,7 +269,7 @@ def route_tokens(
         if not self.training and self.moe_eval_capacity_token_fraction > 0:
             self.expert_capacity = math.ceil(self.moe_eval_capacity_token_fraction * nb_tokens)
         else:
-            capacity = 2 * math.ceil(nb_tokens / self.num_experts)
+            capacity = 2 * int_div_ceil(nb_tokens, self.num_experts)
             self.expert_capacity = capacity if self.expert_capacity is None else self.expert_capacity
 
         # Remove locations outside capacity from ( cumsum < capacity = False will not be routed)
diff --git a/src/transformers/models/pe_audio/modeling_pe_audio.py b/src/transformers/models/pe_audio/modeling_pe_audio.py
index e502073a95c7..ed4bf33b50a7 100644
--- a/src/transformers/models/pe_audio/modeling_pe_audio.py
+++ b/src/transformers/models/pe_audio/modeling_pe_audio.py
@@ -17,7 +17,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import math
 from collections.abc import Callable
 from dataclasses import dataclass
 from typing import Any, Optional
@@ -37,7 +36,7 @@
 from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
-from ...utils import ModelOutput, TransformersKwargs, auto_docstring, can_return_tuple
+from ...utils import ModelOutput, TransformersKwargs, auto_docstring, can_return_tuple, int_div_ceil
 from ...utils.generic import maybe_autocast, merge_with_config_defaults
 from ...utils.output_capturing import capture_outputs
 from ..auto import AutoModel
@@ -110,7 +109,7 @@ def __init__(self, config: PreTrainedConfig, stride: int = 1, stride_index: int
         self.res_unit3 = PeAudioDacResidualUnit(dimension // 2, dilation=9)
         self.snake1 = Snake1d(dimension // 2)
         self.conv1 = nn.Conv1d(
-            dimension // 2, dimension, kernel_size=2 * stride, stride=stride, padding=math.ceil(stride / 2)
+            dimension // 2, dimension, kernel_size=2 * stride, stride=stride, padding=int_div_ceil(stride, 2)
         )
 
     def forward(self, hidden_state):
diff --git a/src/transformers/models/perceiver/modeling_perceiver.py b/src/transformers/models/perceiver/modeling_perceiver.py
index 02f67aca7908..200c14a637cc 100755
--- a/src/transformers/models/perceiver/modeling_perceiver.py
+++ b/src/transformers/models/perceiver/modeling_perceiver.py
@@ -31,7 +31,7 @@
 from ...modeling_outputs import BaseModelOutputWithCrossAttentions
 from ...modeling_utils import PreTrainedModel
 from ...pytorch_utils import apply_chunking_to_forward
-from ...utils import ModelOutput, auto_docstring, logging, torch_int
+from ...utils import ModelOutput, auto_docstring, int_div_ceil, logging, torch_int
 from .configuration_perceiver import PerceiverConfig
 
 
@@ -2965,7 +2965,7 @@ def num_channels(self) -> int:
         elif self.prep_type == "pixels":
             inp_dim = self.in_channels
             if not is_temporal:
-                inp_dim = math.ceil(inp_dim / self.spatial_downsample)
+                inp_dim = int_div_ceil(inp_dim, self.spatial_downsample)
         elif self.prep_type == "patches":
             if self.conv_after_patching:
                 inp_dim = self.out_channels
diff --git a/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py b/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py
index 27d02b9ada7f..15ec606245a7 100644
--- a/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py
+++ b/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py
@@ -44,7 +44,7 @@
 from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
-from ...utils import auto_docstring, torch_int
+from ...utils import auto_docstring, int_div_ceil, torch_int
 from ...utils.generic import TransformersKwargs, can_return_tuple, maybe_autocast, merge_with_config_defaults
 from ...utils.output_capturing import capture_outputs
 from .configuration_phi4_multimodal import Phi4MultimodalAudioConfig, Phi4MultimodalConfig, Phi4MultimodalVisionConfig
@@ -954,7 +954,7 @@ def _streaming_mask(self, seq_len, batch_size, chunk_size, left_chunk):
 
     def forward_embeddings(self, hidden_states, masks):
         """Forwarding the inputs through the top embedding layers"""
-        seq_len = math.ceil(hidden_states.shape[1] / self.config.time_reduction)
+        seq_len = int_div_ceil(hidden_states.shape[1], self.config.time_reduction)
         if seq_len <= 0:
             raise ValueError(
                 f"The sequence length after time reduction is invalid: {seq_len}. Your input feature is too short."
diff --git a/src/transformers/models/phi4_multimodal/modular_phi4_multimodal.py b/src/transformers/models/phi4_multimodal/modular_phi4_multimodal.py
index ef6bf1588c47..c01fa3fb3396 100644
--- a/src/transformers/models/phi4_multimodal/modular_phi4_multimodal.py
+++ b/src/transformers/models/phi4_multimodal/modular_phi4_multimodal.py
@@ -34,7 +34,7 @@
 from ...modeling_rope_utils import RopeParameters
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
-from ...utils import auto_docstring, logging
+from ...utils import auto_docstring, int_div_ceil, logging
 from ...utils.generic import (
     TransformersKwargs,
     can_return_tuple,
@@ -1063,7 +1063,7 @@ def _streaming_mask(self, seq_len, batch_size, chunk_size, left_chunk):
 
     def forward_embeddings(self, hidden_states, masks):
         """Forwarding the inputs through the top embedding layers"""
-        seq_len = math.ceil(hidden_states.shape[1] / self.config.time_reduction)
+        seq_len = int_div_ceil(hidden_states.shape[1], self.config.time_reduction)
         if seq_len <= 0:
             raise ValueError(
                 f"The sequence length after time reduction is invalid: {seq_len}. Your input feature is too short."
diff --git a/src/transformers/models/qwen3_vl/video_processing_qwen3_vl.py b/src/transformers/models/qwen3_vl/video_processing_qwen3_vl.py
index 9f545d272891..bc19a4abbbcc 100644
--- a/src/transformers/models/qwen3_vl/video_processing_qwen3_vl.py
+++ b/src/transformers/models/qwen3_vl/video_processing_qwen3_vl.py
@@ -21,7 +21,7 @@
 from ...feature_extraction_utils import BatchFeature
 from ...image_utils import ChannelDimension, PILImageResampling, SizeDict, get_image_size
 from ...processing_utils import Unpack, VideosKwargs
-from ...utils import TensorType, add_start_docstrings, logging
+from ...utils import TensorType, add_start_docstrings, int_div_ceil, logging
 from ...video_processing_utils import BASE_VIDEO_PROCESSOR_DOCSTRING, BaseVideoProcessor
 from ...video_utils import VideoMetadata, group_videos_by_shape, reorder_videos
 
@@ -46,7 +46,7 @@ def smart_resize(
         )
     h_bar = round(height / factor) * factor
     w_bar = round(width / factor) * factor
-    t_bar = math.ceil(num_frames / temporal_factor) * temporal_factor
+    t_bar = int_div_ceil(num_frames, temporal_factor) * temporal_factor
 
     if t_bar * h_bar * w_bar > max_pixels:
         beta = math.sqrt((num_frames * height * width) / max_pixels)
diff --git a/src/transformers/models/smolvlm/image_processing_smolvlm.py b/src/transformers/models/smolvlm/image_processing_smolvlm.py
index c86beab858a2..a576bb884993 100644
--- a/src/transformers/models/smolvlm/image_processing_smolvlm.py
+++ b/src/transformers/models/smolvlm/image_processing_smolvlm.py
@@ -19,7 +19,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import math
 from collections.abc import Iterable
 
 import numpy as np
@@ -41,7 +40,7 @@
     validate_preprocess_arguments,
 )
 from ...processing_utils import ImagesKwargs
-from ...utils import TensorType, is_vision_available, logging
+from ...utils import TensorType, int_div_ceil, is_vision_available, logging
 
 
 if is_vision_available():
@@ -430,11 +429,11 @@ def split_image(
         frames = []
         if height > max_height or width > max_width:
             # Calculate the number of splits
-            num_splits_h = math.ceil(height / max_height)
-            num_splits_w = math.ceil(width / max_width)
+            num_splits_h = int_div_ceil(height, max_height)
+            num_splits_w = int_div_ceil(width, max_width)
             # Calculate the optimal width and height for the sub-images
-            optimal_height = math.ceil(height / num_splits_h)
-            optimal_width = math.ceil(width / num_splits_w)
+            optimal_height = int_div_ceil(height, num_splits_h)
+            optimal_width = int_div_ceil(width, num_splits_w)
 
             # Iterate through each row and column
             for r in range(num_splits_h):
@@ -501,13 +500,13 @@ def resize_for_vision_encoder(
 
         aspect_ratio = width / height
         if width >= height:
-            width = math.ceil(width / vision_encoder_max_size) * vision_encoder_max_size
+            width = int_div_ceil(width, vision_encoder_max_size) * vision_encoder_max_size
             height = int(width / aspect_ratio)
-            height = math.ceil(height / vision_encoder_max_size) * vision_encoder_max_size
+            height = int_div_ceil(height, vision_encoder_max_size) * vision_encoder_max_size
         elif height > width:
-            height = math.ceil(height / vision_encoder_max_size) * vision_encoder_max_size
+            height = int_div_ceil(height, vision_encoder_max_size) * vision_encoder_max_size
             width = int(height * aspect_ratio)
-            width = math.ceil(width / vision_encoder_max_size) * vision_encoder_max_size
+            width = int_div_ceil(width, vision_encoder_max_size) * vision_encoder_max_size
         new_size = {"height": height, "width": width}
         return self.resize(
             image, size=new_size, resample=resample, input_data_format=input_data_format, data_format=data_format
@@ -892,19 +891,19 @@ def get_number_of_image_patches(self, height: int, width: int, images_kwargs: di
             aspect_ratio = width / height
 
             if width >= height:
-                resized_width = math.ceil(width / max_image_size["longest_edge"]) * max_image_size["longest_edge"]
+                resized_width = int_div_ceil(width, max_image_size["longest_edge"]) * max_image_size["longest_edge"]
                 resized_height = int(width / aspect_ratio)
-                resized_height = math.ceil(height / max_image_size["longest_edge"]) * max_image_size["longest_edge"]
+                resized_height = int_div_ceil(height, max_image_size["longest_edge"]) * max_image_size["longest_edge"]
             elif height > width:
-                resized_height = math.ceil(height / max_image_size["longest_edge"]) * max_image_size["longest_edge"]
+                resized_height = int_div_ceil(height, max_image_size["longest_edge"]) * max_image_size["longest_edge"]
                 resized_width = int(height * aspect_ratio)
-                resized_width = math.ceil(width / max_image_size["longest_edge"]) * max_image_size["longest_edge"]
+                resized_width = int_div_ceil(width, max_image_size["longest_edge"]) * max_image_size["longest_edge"]
 
             max_height = max_width = max_image_size["longest_edge"]
             if resized_height > max_height or resized_width > max_width:
                 # Calculate the number of splits
-                num_rows = math.ceil(resized_height / max_height)
-                num_cols = math.ceil(resized_width / max_width)
+                num_rows = int_div_ceil(resized_height, max_height)
+                num_cols = int_div_ceil(resized_width, max_width)
                 num_patches = num_rows * num_cols + 1
 
         return num_patches, num_rows, num_cols
diff --git a/src/transformers/models/smolvlm/image_processing_smolvlm_fast.py b/src/transformers/models/smolvlm/image_processing_smolvlm_fast.py
index 59ba2fc1f154..af9aaa486f4d 100644
--- a/src/transformers/models/smolvlm/image_processing_smolvlm_fast.py
+++ b/src/transformers/models/smolvlm/image_processing_smolvlm_fast.py
@@ -19,7 +19,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import math
 from typing import Optional
 
 import torch
@@ -34,7 +33,7 @@
     make_nested_list_of_images,
 )
 from ...processing_utils import Unpack
-from ...utils import TensorType, auto_docstring, is_torchvision_available, logging
+from ...utils import TensorType, auto_docstring, int_div_ceil, is_torchvision_available, logging
 from .image_processing_smolvlm import SmolVLMImageProcessorKwargs
 
 
@@ -269,8 +268,8 @@ def split_images(
         frames = []
         if height > max_height or width > max_width:
             # Calculate the number of splits
-            num_splits_h = math.ceil(height / max_height)
-            num_splits_w = math.ceil(width / max_width)
+            num_splits_h = int_div_ceil(height, max_height)
+            num_splits_w = int_div_ceil(width, max_width)
 
             # Split the images by height, then by width
             frames = (
@@ -318,13 +317,13 @@ def resize_for_vision_encoder(
 
         aspect_ratio = width / height
         if width >= height:
-            width = math.ceil(width / vision_encoder_max_size) * vision_encoder_max_size
+            width = int_div_ceil(width, vision_encoder_max_size) * vision_encoder_max_size
             height = int(width / aspect_ratio)
-            height = math.ceil(height / vision_encoder_max_size) * vision_encoder_max_size
+            height = int_div_ceil(height, vision_encoder_max_size) * vision_encoder_max_size
         elif height > width:
-            height = math.ceil(height / vision_encoder_max_size) * vision_encoder_max_size
+            height = int_div_ceil(height, vision_encoder_max_size) * vision_encoder_max_size
             width = int(height * aspect_ratio)
-            width = math.ceil(width / vision_encoder_max_size) * vision_encoder_max_size
+            width = int_div_ceil(width, vision_encoder_max_size) * vision_encoder_max_size
         new_size = SizeDict(height=height, width=width)
         return self.resize(image, size=new_size, interpolation=interpolation)
 
@@ -515,19 +514,19 @@ def get_number_of_image_patches(self, height: int, width: int, images_kwargs: di
             aspect_ratio = width / height
 
             if width >= height:
-                resized_width = math.ceil(width / max_image_size["longest_edge"]) * max_image_size["longest_edge"]
+                resized_width = int_div_ceil(width, max_image_size["longest_edge"]) * max_image_size["longest_edge"]
                 resized_height = int(width / aspect_ratio)
-                resized_height = math.ceil(height / max_image_size["longest_edge"]) * max_image_size["longest_edge"]
+                resized_height = int_div_ceil(height, max_image_size["longest_edge"]) * max_image_size["longest_edge"]
             elif height > width:
-                resized_height = math.ceil(height / max_image_size["longest_edge"]) * max_image_size["longest_edge"]
+                resized_height = int_div_ceil(height, max_image_size["longest_edge"]) * max_image_size["longest_edge"]
                 resized_width = int(height * aspect_ratio)
-                resized_width = math.ceil(width / max_image_size["longest_edge"]) * max_image_size["longest_edge"]
+                resized_width = int_div_ceil(width, max_image_size["longest_edge"]) * max_image_size["longest_edge"]
 
             max_height = max_width = max_image_size["longest_edge"]
             if resized_height > max_height or resized_width > max_width:
                 # Calculate the number of splits
-                num_rows = math.ceil(resized_height / max_height)
-                num_cols = math.ceil(resized_width / max_width)
+                num_rows = int_div_ceil(resized_height, max_height)
+                num_cols = int_div_ceil(resized_width, max_width)
                 num_patches = num_rows * num_cols + 1
 
         return num_patches, num_rows, num_cols
diff --git a/src/transformers/models/voxtral_realtime/modeling_voxtral_realtime.py b/src/transformers/models/voxtral_realtime/modeling_voxtral_realtime.py
index 78b120f96a03..61b082d71a9c 100644
--- a/src/transformers/models/voxtral_realtime/modeling_voxtral_realtime.py
+++ b/src/transformers/models/voxtral_realtime/modeling_voxtral_realtime.py
@@ -39,7 +39,14 @@
 from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
-from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_torchdynamo_compiling, logging
+from ...utils import (
+    TransformersKwargs,
+    auto_docstring,
+    can_return_tuple,
+    int_div_ceil,
+    is_torchdynamo_compiling,
+    logging,
+)
 from ...utils.generic import maybe_autocast, merge_with_config_defaults
 from ...utils.output_capturing import capture_outputs
 from ..auto import AutoModel
@@ -1251,7 +1258,7 @@ def _prepare_generation_config(
         input_features = model_kwargs.get("input_features")
         if input_features is not None and not isinstance(input_features, GeneratorType):
             audio_length = input_features.shape[-1]
-            num_audio_tokens = math.ceil(audio_length / self.config.audio_length_per_tok)
+            num_audio_tokens = int_div_ceil(audio_length, self.config.audio_length_per_tok)
             # Stash for use in _prepare_generated_length
             generation_config._num_audio_tokens = num_audio_tokens
 
diff --git a/src/transformers/models/voxtral_realtime/modular_voxtral_realtime.py b/src/transformers/models/voxtral_realtime/modular_voxtral_realtime.py
index 85c6007a3edc..be96ed793e3f 100644
--- a/src/transformers/models/voxtral_realtime/modular_voxtral_realtime.py
+++ b/src/transformers/models/voxtral_realtime/modular_voxtral_realtime.py
@@ -42,7 +42,14 @@
     VoxtralPreTrainedModel,
 )
 from ...processing_utils import Unpack
-from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_torchdynamo_compiling, logging
+from ...utils import (
+    TransformersKwargs,
+    auto_docstring,
+    can_return_tuple,
+    int_div_ceil,
+    is_torchdynamo_compiling,
+    logging,
+)
 from ...utils.generic import merge_with_config_defaults
 from ...utils.output_capturing import capture_outputs
 from .configuration_voxtral_realtime import VoxtralRealtimeEncoderConfig
@@ -816,7 +823,7 @@ def _prepare_generation_config(
         input_features = model_kwargs.get("input_features")
         if input_features is not None and not isinstance(input_features, GeneratorType):
             audio_length = input_features.shape[-1]
-            num_audio_tokens = math.ceil(audio_length / self.config.audio_length_per_tok)
+            num_audio_tokens = int_div_ceil(audio_length, self.config.audio_length_per_tok)
             # Stash for use in _prepare_generated_length
             generation_config._num_audio_tokens = num_audio_tokens
 
diff --git a/src/transformers/models/xcodec/configuration_xcodec.py b/src/transformers/models/xcodec/configuration_xcodec.py
index 6c0479d425a6..8edea22df1c6 100644
--- a/src/transformers/models/xcodec/configuration_xcodec.py
+++ b/src/transformers/models/xcodec/configuration_xcodec.py
@@ -18,7 +18,7 @@
 import numpy as np
 
 from ...configuration_utils import PreTrainedConfig
-from ...utils import auto_docstring
+from ...utils import auto_docstring, int_div_ceil
 from ..auto import CONFIG_MAPPING, AutoConfig
 
 
@@ -128,7 +128,7 @@ def __init__(
 
     @property
     def frame_rate(self) -> int:
-        return math.ceil(self.sample_rate / self.hop_length)
+        return int_div_ceil(self.sample_rate, self.hop_length)
 
     @property
     def semantic_hidden_size(self) -> int:
diff --git a/src/transformers/models/zamba/configuration_zamba.py b/src/transformers/models/zamba/configuration_zamba.py
index eeb652d47c70..54dfe8b29d38 100644
--- a/src/transformers/models/zamba/configuration_zamba.py
+++ b/src/transformers/models/zamba/configuration_zamba.py
@@ -13,10 +13,9 @@
 # limitations under the License.
 """Zamba model configuration"""
 
-import math
 
 from ...configuration_utils import PreTrainedConfig
-from ...utils import auto_docstring, logging
+from ...utils import auto_docstring, int_div_ceil, logging
 
 
 logger = logging.get_logger(__name__)
@@ -125,7 +124,7 @@ def __init__(
         self.mamba_d_state = mamba_d_state
         self.mamba_d_conv = mamba_d_conv
         self.mamba_expand = mamba_expand
-        self.mamba_dt_rank = math.ceil(self.hidden_size / 16) if mamba_dt_rank == "auto" else mamba_dt_rank
+        self.mamba_dt_rank = int_div_ceil(self.hidden_size, 16) if mamba_dt_rank == "auto" else mamba_dt_rank
         self.time_step_min = time_step_min
         self.time_step_max = time_step_max
         self.time_step_floor = time_step_floor
diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index 10d1938f8732..5214ae0c4afb 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -161,6 +161,7 @@
     can_return_loss,
     check_torch_load_is_safe,
     find_labels,
+    int_div_ceil,
     is_accelerate_available,
     is_datasets_available,
     is_in_notebook,
@@ -2590,7 +2591,7 @@ def evaluate(
                 metric_key_prefix,
                 start_time,
                 num_samples=output.num_samples,
-                num_steps=math.ceil(output.num_samples / total_batch_size),
+                num_steps=int_div_ceil(output.num_samples, total_batch_size),
             )
         )
 
@@ -2864,7 +2865,7 @@ def predict(
                 metric_key_prefix,
                 start_time,
                 num_samples=output.num_samples,
-                num_steps=math.ceil(output.num_samples / total_batch_size),
+                num_steps=int_div_ceil(output.num_samples, total_batch_size),
             )
         )
 
diff --git a/src/transformers/trainer_pt_utils.py b/src/transformers/trainer_pt_utils.py
index 30377f5f5a61..d107a5941b69 100644
--- a/src/transformers/trainer_pt_utils.py
+++ b/src/transformers/trainer_pt_utils.py
@@ -20,7 +20,6 @@
 import datetime
 import io
 import json
-import math
 import os
 import re
 import sys
@@ -43,6 +42,7 @@
 from .integrations.deepspeed import is_deepspeed_zero3_enabled
 from .tokenization_utils_base import BatchEncoding
 from .utils import (
+    int_div_ceil,
     is_sagemaker_mp_enabled,
     is_torch_available,
     is_torch_xla_available,
@@ -612,9 +612,9 @@ def __init__(
             # Split to nearest available length that is evenly divisible.
             # This is to ensure each rank receives the same amount of data when
             # using this Sampler.
-            self.num_samples = math.ceil((len(self.lengths) - self.num_replicas) / self.num_replicas)
+            self.num_samples = int_div_ceil(len(self.lengths) - self.num_replicas, self.num_replicas)
         else:
-            self.num_samples = math.ceil(len(self.lengths) / self.num_replicas)
+            self.num_samples = int_div_ceil(len(self.lengths), self.num_replicas)
         self.total_size = self.num_samples * self.num_replicas
         self.seed = seed
 
@@ -664,7 +664,7 @@ def __init__(
 
         self.total_batch_size = total_batch_size = batch_size * num_processes
 
-        num_batches = len(dataset) // total_batch_size if drop_last else math.ceil(len(dataset) / total_batch_size)
+        num_batches = len(dataset) // total_batch_size if drop_last else int_div_ceil(len(dataset), total_batch_size)
         self.total_num_samples = num_batches * total_batch_size
 
     def __iter__(self):
@@ -788,7 +788,7 @@ def __len__(self):
         if self.drop_last:
             return (len(self.dataset) // (self.batch_size * self.num_processes)) * self.batch_size
         else:
-            return math.ceil(len(self.dataset) / (self.batch_size * self.num_processes)) * self.batch_size
+            return int_div_ceil(len(self.dataset), self.batch_size * self.num_processes) * self.batch_size
 
 
 def _secs2timedelta(secs):
diff --git a/src/transformers/utils/generic.py b/src/transformers/utils/generic.py
index 2dd6af34eb8a..971f1967c460 100644
--- a/src/transformers/utils/generic.py
+++ b/src/transformers/utils/generic.py
@@ -665,7 +665,7 @@ def torch_float(x):
     return x.to(torch.float32) if torch.jit.is_tracing() and isinstance(x, torch.Tensor) else int(x)
 
 
-def int_div_ceil(a: int | "torch.Tensor", b: int | "torch.Tensor") -> int:
+def int_div_ceil(a: int | torch.Tensor, b: int | torch.Tensor) -> int:
     """
     Perform integer ceiling division without intermediate floating-point conversion.
 

From 319220d8ea172094460c5786ad7d30462ec17ccb Mon Sep 17 00:00:00 2001
From: Joshua Lochner <26504141+xenova@users.noreply.github.com>
Date: Sun, 15 Mar 2026 16:32:00 -0400
Subject: [PATCH 3/5] formatting

---
 src/transformers/models/fuyu/image_processing_fuyu.py  | 1 -
 src/transformers/models/jamba/configuration_jamba.py   | 1 -
 src/transformers/models/mamba/configuration_mamba.py   | 1 -
 src/transformers/models/mamba2/configuration_mamba2.py | 1 -
 src/transformers/models/zamba/configuration_zamba.py   | 1 -
 5 files changed, 5 deletions(-)

diff --git a/src/transformers/models/fuyu/image_processing_fuyu.py b/src/transformers/models/fuyu/image_processing_fuyu.py
index 3b0aeb6a9865..c63f1b2e54ac 100644
--- a/src/transformers/models/fuyu/image_processing_fuyu.py
+++ b/src/transformers/models/fuyu/image_processing_fuyu.py
@@ -13,7 +13,6 @@
 # limitations under the License.
 """Image processor class for Fuyu."""
 
-
 import numpy as np
 
 from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
diff --git a/src/transformers/models/jamba/configuration_jamba.py b/src/transformers/models/jamba/configuration_jamba.py
index a97f92ff4bdb..e536cc1d75d1 100644
--- a/src/transformers/models/jamba/configuration_jamba.py
+++ b/src/transformers/models/jamba/configuration_jamba.py
@@ -13,7 +13,6 @@
 # limitations under the License.
 """Jamba model configuration"""
 
-
 from ...configuration_utils import PreTrainedConfig
 from ...utils import auto_docstring, int_div_ceil, logging
 
diff --git a/src/transformers/models/mamba/configuration_mamba.py b/src/transformers/models/mamba/configuration_mamba.py
index c3997924b5fd..cc90e05cd7fc 100644
--- a/src/transformers/models/mamba/configuration_mamba.py
+++ b/src/transformers/models/mamba/configuration_mamba.py
@@ -13,7 +13,6 @@
 # limitations under the License.
 """MAMBA configuration"""
 
-
 from ...configuration_utils import PreTrainedConfig
 from ...utils import auto_docstring, int_div_ceil, logging
 
diff --git a/src/transformers/models/mamba2/configuration_mamba2.py b/src/transformers/models/mamba2/configuration_mamba2.py
index 96bfe4f23f7b..981f592e9687 100644
--- a/src/transformers/models/mamba2/configuration_mamba2.py
+++ b/src/transformers/models/mamba2/configuration_mamba2.py
@@ -13,7 +13,6 @@
 # limitations under the License.
 """MAMBA2 configuration"""
 
-
 from ...configuration_utils import PreTrainedConfig
 from ...utils import auto_docstring, int_div_ceil, logging
 
diff --git a/src/transformers/models/zamba/configuration_zamba.py b/src/transformers/models/zamba/configuration_zamba.py
index 54dfe8b29d38..16534bc4d4d5 100644
--- a/src/transformers/models/zamba/configuration_zamba.py
+++ b/src/transformers/models/zamba/configuration_zamba.py
@@ -13,7 +13,6 @@
 # limitations under the License.
 """Zamba model configuration"""
 
-
 from ...configuration_utils import PreTrainedConfig
 from ...utils import auto_docstring, int_div_ceil, logging
 

From a54b03e84dc9554a2f69c3a68770c2d1c0ca946e Mon Sep 17 00:00:00 2001
From: Joshua Lochner <26504141+xenova@users.noreply.github.com>
Date: Sun, 15 Mar 2026 16:36:38 -0400
Subject: [PATCH 4/5] formatting? ci mad

---
 src/transformers/models/metaclip_2/convert_metaclip_2_to_hf.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/transformers/models/metaclip_2/convert_metaclip_2_to_hf.py b/src/transformers/models/metaclip_2/convert_metaclip_2_to_hf.py
index 6db36199dca3..543666f8750e 100644
--- a/src/transformers/models/metaclip_2/convert_metaclip_2_to_hf.py
+++ b/src/transformers/models/metaclip_2/convert_metaclip_2_to_hf.py
@@ -33,7 +33,6 @@
     MetaClip2Model,
 )
 
-
 def load_metaclip2_checkpoint(checkpoint_path: str, model_name: str) -> torch.nn.Module:
     """Load MetaCLIP 2 model from checkpoint."""
     print(f"Loading MetaCLIP 2 model: {model_name}")

From 2888c627e9987686537f95946e7d82cdfe5729fd Mon Sep 17 00:00:00 2001
From: Joshua Lochner <26504141+xenova@users.noreply.github.com>
Date: Sun, 15 Mar 2026 16:39:38 -0400
Subject: [PATCH 5/5] undo

---
 src/transformers/models/metaclip_2/convert_metaclip_2_to_hf.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/transformers/models/metaclip_2/convert_metaclip_2_to_hf.py b/src/transformers/models/metaclip_2/convert_metaclip_2_to_hf.py
index 543666f8750e..6db36199dca3 100644
--- a/src/transformers/models/metaclip_2/convert_metaclip_2_to_hf.py
+++ b/src/transformers/models/metaclip_2/convert_metaclip_2_to_hf.py
@@ -33,6 +33,7 @@
     MetaClip2Model,
 )
 
+
 def load_metaclip2_checkpoint(checkpoint_path: str, model_name: str) -> torch.nn.Module:
     """Load MetaCLIP 2 model from checkpoint."""
     print(f"Loading MetaCLIP 2 model: {model_name}")