huggingface · xenova · Mar 15, 2026 · Mar 15, 2026 · Mar 15, 2026 · Mar 15, 2026
diff --git a/src/transformers/core_model_loading.py b/src/transformers/core_model_loading.py
@@ -15,7 +15,6 @@
 
 from __future__ import annotations
 
-import math
 import os
 import re
 import traceback
@@ -33,7 +32,7 @@
 
 from .integrations.accelerate import get_device, offload_weight
 from .integrations.tensor_parallel import ALL_PARALLEL_STYLES
-from .utils import is_env_variable_true
+from .utils import int_div_ceil, is_env_variable_true
 from .utils.loading_report import LoadStateDictInfo
 from .utils.logging import get_logger, tqdm
 
@@ -354,7 +353,7 @@ def __init__(self, stack_dim: int = 0, concat_dim: int = 1):
         self.concat_dim = concat_dim
 
     def split_list_into_chunks(self, tensor_list: list[torch.Tensor], chunks: int = 2):
-        split_size = math.ceil(len(tensor_list) / chunks)  # best effort split size
+        split_size = int_div_ceil(len(tensor_list), chunks)  # best effort split size
         return [tensor_list[i * split_size : (i + 1) * split_size] for i in range(chunks)]
 
     @torch.no_grad()

diff --git a/src/transformers/integrations/tensor_parallel.py b/src/transformers/integrations/tensor_parallel.py
@@ -13,14 +13,13 @@
 # limitations under the License.
 from __future__ import annotations
 
-import math
 import operator
 import os
 import re
 from functools import reduce
 
 from ..distributed import DistributedConfig
-from ..utils import is_torch_greater_or_equal, logging
+from ..utils import int_div_ceil, is_torch_greater_or_equal, logging
 from ..utils.generic import GeneralInterface
 from ..utils.import_utils import is_torch_available
 
@@ -374,7 +373,7 @@ def get_tensor_shard(param, empty_param, device_mesh, rank, dim, tensor_idx: int
     elif empty_param.dim() == 3 and dim == 2 and len(param_shape) == 2:
         dim = 1
 
-    shard_size = math.ceil(param_shape[dim] / world_size)
+    shard_size = int_div_ceil(param_shape[dim], world_size)
     start = rank * shard_size
     end = min(start + shard_size, param_shape[dim])
 
@@ -723,7 +722,7 @@ def get_expected_sharded_shape(self, full_shape: tuple[int, ...] | torch.Size) -
         # Colwise shards dim -2, but 1D tensors (bias) shard on dim -1
         dim = -1 if len(shape) == 1 else -2
         dim = len(shape) + dim if dim < 0 else dim
-        shard_size = math.ceil(shape[dim] / world_size)
+        shard_size = int_div_ceil(shape[dim], world_size)
         start = self.rank * shard_size
         end = min(start + shard_size, shape[dim])
         shape[dim] = end - start
@@ -866,7 +865,7 @@ def get_expected_sharded_shape(self, full_shape: tuple[int, ...] | torch.Size) -
         shape = list(full_shape)
         dim = -1
         dim = len(shape) + dim if dim < 0 else dim
-        shard_size = math.ceil(shape[dim] / world_size)
+        shard_size = int_div_ceil(shape[dim], world_size)
         start = self.rank * shard_size
         end = min(start + shard_size, shape[dim])
         shape[dim] = end - start
@@ -996,7 +995,7 @@ def get_expected_sharded_shape(self, full_shape: tuple[int, ...] | torch.Size) -
         # 1D tensors (bias) shard on dim -1
         dim = -1 if len(shape) == 1 else self.embedding_dim_sharding
         dim = len(shape) + dim if dim < 0 else dim
-        shard_size = math.ceil(shape[dim] / world_size)
+        shard_size = int_div_ceil(shape[dim], world_size)
         start = self.rank * shard_size
         end = min(start + shard_size, shape[dim])
         shape[dim] = end - start

diff --git a/src/transformers/models/bit/modeling_bit.py b/src/transformers/models/bit/modeling_bit.py
@@ -30,7 +30,7 @@
     ImageClassifierOutputWithNoAttention,
 )
 from ...modeling_utils import PreTrainedModel
-from ...utils import auto_docstring, logging
+from ...utils import auto_docstring, int_div_ceil, logging
 from ...utils.generic import can_return_tuple
 from .configuration_bit import BitConfig
 
@@ -169,7 +169,7 @@ def __init__(self, kernel_size, stride, dilation, value=0):
         self.value = value
 
         def compute_padding(x, kernel_size, stride, dilation):
-            return max((math.ceil(x / stride) - 1) * stride + (kernel_size - 1) * dilation + 1 - x, 0)
+            return max((int_div_ceil(x, stride) - 1) * stride + (kernel_size - 1) * dilation + 1 - x, 0)
 
         self.compute_padding = compute_padding
 

diff --git a/src/transformers/models/chmv2/image_processing_chmv2_fast.py b/src/transformers/models/chmv2/image_processing_chmv2_fast.py
@@ -31,7 +31,7 @@
 from ...image_utils import ChannelDimension, ImageInput, PILImageResampling, SizeDict, is_torch_tensor
 from ...modeling_outputs import DepthEstimatorOutput
 from ...processing_utils import Unpack
-from ...utils import TensorType, auto_docstring, requires_backends
+from ...utils import TensorType, auto_docstring, int_div_ceil, requires_backends
 from .image_processing_chmv2 import CHMv2ImageProcessorKwargs
 
 
@@ -48,7 +48,7 @@ def constrain_to_multiple_of(val, multiple, min_val=0, max_val=None):
             x = math.floor(val / multiple) * multiple
 
         if x < min_val:
-            x = math.ceil(val / multiple) * multiple
+            x = int_div_ceil(val, multiple) * multiple
 
         return x
 
@@ -313,7 +313,7 @@ def pad_image(
         height, width = image.shape[-2:]
 
         def _get_pad(size, size_divisor):
-            new_size = math.ceil(size / size_divisor) * size_divisor
+            new_size = int_div_ceil(size, size_divisor) * size_divisor
             pad_size = new_size - size
             pad_size_left = pad_size // 2
             pad_size_right = pad_size - pad_size_left

diff --git a/src/transformers/models/dac/configuration_dac.py b/src/transformers/models/dac/configuration_dac.py
@@ -13,12 +13,10 @@
 # limitations under the License.
 """Dac model configuration"""
 
-import math
-
 import numpy as np
 
 from ...configuration_utils import PreTrainedConfig
-from ...utils import auto_docstring, logging
+from ...utils import auto_docstring, int_div_ceil, logging
 
 
 logger = logging.get_logger(__name__)
@@ -88,7 +86,7 @@ def __init__(
     @property
     def frame_rate(self) -> int:
         hop_length = np.prod(self.upsampling_ratios)
-        return math.ceil(self.sampling_rate / hop_length)
+        return int_div_ceil(self.sampling_rate, int(hop_length))
 
 
 __all__ = ["DacConfig"]
diff --git a/src/transformers/models/dac/modeling_dac.py b/src/transformers/models/dac/modeling_dac.py
@@ -23,7 +23,7 @@
 
 from ... import initialization as init
 from ...modeling_utils import PreTrainedAudioTokenizerBase
-from ...utils import ModelOutput, auto_docstring
+from ...utils import ModelOutput, auto_docstring, int_div_ceil
 from .configuration_dac import DacConfig
 
 
@@ -219,7 +219,7 @@ def __init__(self, config: DacConfig, stride: int = 1, stride_index: int = 1):
         self.res_unit3 = DacResidualUnit(dimension // 2, dilation=9)
         self.snake1 = Snake1d(dimension // 2)
         self.conv1 = nn.Conv1d(
-            dimension // 2, dimension, kernel_size=2 * stride, stride=stride, padding=math.ceil(stride / 2)
+            dimension // 2, dimension, kernel_size=2 * stride, stride=stride, padding=int_div_ceil(stride, 2)
         )
 
     def forward(self, hidden_state):
@@ -245,7 +245,7 @@ def __init__(self, config: DacConfig, stride: int = 1, stride_index: int = 1):
             output_dim,
             kernel_size=2 * stride,
             stride=stride,
-            padding=math.ceil(stride / 2),
+            padding=int_div_ceil(stride, 2),
         )
 
         self.res_unit1 = DacResidualUnit(output_dim, dilation=1)

diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py
@@ -22,7 +22,7 @@
 
 from ... import initialization as init
 from ...modeling_utils import PreTrainedModel
-from ...utils import ModelOutput, auto_docstring, logging, torch_int
+from ...utils import ModelOutput, auto_docstring, int_div_ceil, logging, torch_int
 from ..auto import AutoModel
 from .configuration_depth_pro import DepthProConfig
 
@@ -895,16 +895,16 @@ def __init__(self, config: DepthProConfig):
         for i in range(config.num_fov_head_layers):
             self.layers.append(
                 nn.Conv2d(
-                    math.ceil(self.fusion_hidden_size / 2 ** (i + 1)),
-                    math.ceil(self.fusion_hidden_size / 2 ** (i + 2)),
+                    int_div_ceil(self.fusion_hidden_size, 2 ** (i + 1)),
+                    int_div_ceil(self.fusion_hidden_size, 2 ** (i + 2)),
                     kernel_size=3,
                     stride=2,
                     padding=1,
                 )
             )
             self.layers.append(nn.ReLU(True))
         # calculate expected shapes to finally generate a scalar output from final head layer
-        final_in_channels = math.ceil(self.fusion_hidden_size / 2 ** (config.num_fov_head_layers + 1))
+        final_in_channels = int_div_ceil(self.fusion_hidden_size, 2 ** (config.num_fov_head_layers + 1))
         final_kernel_size = torch_int((self.out_size - 1) / 2**config.num_fov_head_layers + 1)
         self.layers.append(
             nn.Conv2d(

diff --git a/src/transformers/models/dpt/convert_dinov2_depth_to_hf.py b/src/transformers/models/dpt/convert_dinov2_depth_to_hf.py
@@ -16,7 +16,6 @@
 
 import argparse
 import itertools
-import math
 from io import BytesIO
 from pathlib import Path
 
@@ -26,7 +25,7 @@
 from torchvision import transforms
 
 from transformers import Dinov2Config, DPTConfig, DPTForDepthEstimation, DPTImageProcessor
-from transformers.utils import logging
+from transformers.utils import int_div_ceil, logging
 
 
 logging.set_verbosity_info()
@@ -207,7 +206,7 @@ def __init__(self, multiple):
             self.multiple = multiple
 
         def _get_pad(self, size):
-            new_size = math.ceil(size / self.multiple) * self.multiple
+            new_size = int_div_ceil(size, self.multiple) * self.multiple
             pad_size = new_size - size
             pad_size_left = pad_size // 2
             pad_size_right = pad_size - pad_size_left

diff --git a/src/transformers/models/dpt/image_processing_dpt.py b/src/transformers/models/dpt/image_processing_dpt.py
@@ -47,6 +47,7 @@
 from ...utils import (
     TensorType,
     filter_out_non_signature_kwargs,
+    int_div_ceil,
     is_vision_available,
     logging,
     requires_backends,
@@ -97,7 +98,7 @@ def constrain_to_multiple_of(val, multiple, min_val=0, max_val=None):
             x = math.floor(val / multiple) * multiple
 
         if x < min_val:
-            x = math.ceil(val / multiple) * multiple
+            x = int_div_ceil(val, multiple) * multiple
 
         return x
 
@@ -289,7 +290,7 @@ def pad_image(
         """
 
         def _get_pad(size, size_divisor):
-            new_size = math.ceil(size / size_divisor) * size_divisor
+            new_size = int_div_ceil(size, size_divisor) * size_divisor
             pad_size = new_size - size
             pad_size_left = pad_size // 2
             pad_size_right = pad_size - pad_size_left

diff --git a/src/transformers/models/dpt/image_processing_dpt_fast.py b/src/transformers/models/dpt/image_processing_dpt_fast.py
@@ -39,7 +39,7 @@
     is_torch_tensor,
 )
 from ...processing_utils import Unpack
-from ...utils import TensorType, auto_docstring, requires_backends
+from ...utils import TensorType, auto_docstring, int_div_ceil, requires_backends
 from .image_processing_dpt import DPTImageProcessorKwargs
 
 
@@ -60,7 +60,7 @@ def constrain_to_multiple_of(val, multiple, min_val=0, max_val=None):
             x = math.floor(val / multiple) * multiple
 
         if x < min_val:
-            x = math.ceil(val / multiple) * multiple
+            x = int_div_ceil(val, multiple) * multiple
 
         return x
 
@@ -325,7 +325,7 @@ def pad_image(
         height, width = image.shape[-2:]
 
         def _get_pad(size, size_divisor):
-            new_size = math.ceil(size / size_divisor) * size_divisor
+            new_size = int_div_ceil(size, size_divisor) * size_divisor
             pad_size = new_size - size
             pad_size_left = pad_size // 2
             pad_size_right = pad_size - pad_size_left

diff --git a/src/transformers/models/dpt/modular_dpt.py b/src/transformers/models/dpt/modular_dpt.py
@@ -31,6 +31,7 @@
 from ...utils import (
     TensorType,
     auto_docstring,
+    int_div_ceil,
     requires_backends,
 )
 from ..beit.image_processing_beit_fast import BeitImageProcessorFast
@@ -56,7 +57,7 @@ def constrain_to_multiple_of(val, multiple, min_val=0, max_val=None):
             x = math.floor(val / multiple) * multiple
 
         if x < min_val:
-            x = math.ceil(val / multiple) * multiple
+            x = int_div_ceil(val, multiple) * multiple
 
         return x
 
@@ -160,7 +161,7 @@ def pad_image(
         height, width = image.shape[-2:]
 
         def _get_pad(size, size_divisor):
-            new_size = math.ceil(size / size_divisor) * size_divisor
+            new_size = int_div_ceil(size, size_divisor) * size_divisor
             pad_size = new_size - size
             pad_size_left = pad_size // 2
             pad_size_right = pad_size - pad_size_left

diff --git a/src/transformers/models/encodec/configuration_encodec.py b/src/transformers/models/encodec/configuration_encodec.py
@@ -18,7 +18,7 @@
 import numpy as np
 
 from ...configuration_utils import PreTrainedConfig
-from ...utils import auto_docstring, logging
+from ...utils import auto_docstring, int_div_ceil, logging
 
 
 logger = logging.get_logger(__name__)
@@ -169,7 +169,7 @@ def codebook_nbits(self) -> int:
 
     @property
     def frame_rate(self) -> int:
-        return math.ceil(self.sampling_rate / self.hop_length)
+        return int_div_ceil(self.sampling_rate, self.hop_length)
 
     @property
     def num_quantizers(self) -> int:

diff --git a/src/transformers/models/eomt/image_processing_eomt.py b/src/transformers/models/eomt/image_processing_eomt.py
@@ -13,8 +13,6 @@
 # limitations under the License.
 """Image processor class for EoMT."""
 
-import math
-
 import numpy as np
 
 from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
@@ -41,6 +39,7 @@
     IMAGENET_DEFAULT_STD,
     TensorType,
     filter_out_non_signature_kwargs,
+    int_div_ceil,
     is_torch_available,
     logging,
 )
@@ -347,7 +346,7 @@ def _split_image(self, image: ImageInput, size: dict, image_index: int) -> tuple
         patch_size = size["shortest_edge"]
 
         longer_side = max(image_size)
-        num_patches = math.ceil(longer_side / patch_size)
+        num_patches = int_div_ceil(longer_side, patch_size)
         total_overlap = num_patches * patch_size - longer_side
         overlap_per_patch = total_overlap / (num_patches - 1) if num_patches > 1 else 0
 

diff --git a/src/transformers/models/eomt/image_processing_eomt_fast.py b/src/transformers/models/eomt/image_processing_eomt_fast.py
@@ -13,7 +13,6 @@
 # limitations under the License.
 """Fast Image processor class for EoMT."""
 
-import math
 from typing import Optional, Union
 
 import numpy as np
@@ -39,6 +38,7 @@
     TensorType,
     auto_docstring,
     filter_out_non_signature_kwargs,
+    int_div_ceil,
 )
 from .image_processing_eomt import (
     EomtImageProcessorKwargs,
@@ -127,7 +127,7 @@ def _split_image(self, images: torch.Tensor, size: dict, image_indices: int) ->
         patch_size = size["shortest_edge"]
 
         longer_side = max(height, width)
-        num_patches = math.ceil(longer_side / patch_size)
+        num_patches = int_div_ceil(longer_side, patch_size)
         total_overlap = num_patches * patch_size - longer_side
         overlap_per_patch = total_overlap / (num_patches - 1) if num_patches > 1 else 0
 

diff --git a/src/transformers/models/falcon_mamba/configuration_falcon_mamba.py b/src/transformers/models/falcon_mamba/configuration_falcon_mamba.py
@@ -17,10 +17,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import math
-
 from ...configuration_utils import PreTrainedConfig
-from ...utils import auto_docstring
+from ...utils import auto_docstring, int_div_ceil
 
 
 @auto_docstring(checkpoint="tiiuae/falcon-mamba-7b")
@@ -117,7 +115,7 @@ def __init__(
         self.use_conv_bias = use_conv_bias
         self.hidden_act = hidden_act
         self.initializer_range = initializer_range
-        self.time_step_rank = math.ceil(self.hidden_size / 16) if time_step_rank == "auto" else time_step_rank
+        self.time_step_rank = int_div_ceil(self.hidden_size, 16) if time_step_rank == "auto" else time_step_rank
         self.time_step_scale = time_step_scale
         self.time_step_min = time_step_min
         self.time_step_max = time_step_max