Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -176,3 +176,7 @@ tags
# Cursor IDE files
.cursor/
test-results/
src/transformers/models/audio_spectrogram_transformer/audio_processing_audio_spectrogram_transformer.py
.gitignore
tests/test_wav2vec2_whisper.py
run_preprocessing_tests.sh
5 changes: 5 additions & 0 deletions src/transformers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -355,6 +355,7 @@

_import_structure["utils.dummy_pt_objects"] = [name for name in dir(dummy_pt_objects) if not name.startswith("_")]
else:
_import_structure["audio_processing_backends"] = ["NumpyAudioBackend", "NumpyBackend", "TorchAudioBackend", "TorchBackend"]
_import_structure["model_debugging_utils"] = [
"model_addition_debugger_context",
]
Expand Down Expand Up @@ -477,6 +478,10 @@
if TYPE_CHECKING:
# All modeling imports
# Models
from .audio_processing_backends import NumpyAudioBackend as NumpyAudioBackend
from .audio_processing_backends import NumpyBackend as NumpyBackend
from .audio_processing_backends import TorchAudioBackend as TorchAudioBackend
from .audio_processing_backends import TorchBackend as TorchBackend
from .backbone_utils import BackboneConfigMixin, BackboneMixin
from .cache_utils import Cache as Cache
from .cache_utils import DynamicCache as DynamicCache
Expand Down
702 changes: 702 additions & 0 deletions src/transformers/audio_processing_backends.py

Large diffs are not rendered by default.

148 changes: 148 additions & 0 deletions src/transformers/audio_processing_base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
# Copyright 2025 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import warnings
from typing import Any, ClassVar, TypeVar

from .audio_utils import is_valid_audio, load_audio
from .feature_extraction_utils import BatchFeature as BaseBatchFeature
from .preprocessing_base import PreprocessingMixin
from .utils import (
FEATURE_EXTRACTOR_NAME,
copy_func,
logging,
)


# Maps deprecated feature-extractor output keys to their audio-processor
# replacements; consulted by BatchFeature when a caller looks up an old key.
_LEGACY_KEY_MAP = {
    "input_features": "audio_features",
    "input_values": "audio_values",
    "audio_input_features": "audio_features",
}


# Type variable for classmethod constructors that return an instance of the
# concrete AudioProcessingMixin subclass they were called on.
AudioProcessorType = TypeVar("AudioProcessorType", bound="AudioProcessingMixin")


# Module-level logger, following the library-wide logging convention.
logger = logging.get_logger(__name__)


class BatchFeature(BaseBatchFeature):
    r"""
    Holds the output of the audio processor specific `__call__` methods.

    Behaves like a plain dictionary, with one addition: lookups using the
    deprecated feature-extractor key names (e.g. `input_features`) are
    transparently redirected to the new audio-processor keys (e.g.
    `audio_features`), emitting a `FutureWarning` the first time each legacy
    key is used.

    Args:
        data (`dict`):
            Dictionary of lists/arrays/tensors returned by the __call__ method ('input_values', 'input_features', etc.).
        tensor_type (`Union[None, str, TensorType]`, *optional*):
            You can give a tensor_type here to convert the lists of integers in PyTorch/Numpy Tensors at
            initialization.
    """

    # Legacy keys already warned about, shared across every instance so each
    # deprecated name triggers at most one warning per process.
    _warned_keys: ClassVar[set] = set()

    def __getitem__(self, key):
        legacy_lookup = isinstance(key, str) and key not in self.data
        if legacy_lookup:
            target = self._resolve_legacy_key(key)
            if target is not None and target in self.data:
                if key not in BatchFeature._warned_keys:
                    BatchFeature._warned_keys.add(key)
                    warnings.warn(
                        f"Accessing '{key}' is deprecated, use '{target}' instead.",
                        FutureWarning,
                        stacklevel=2,
                    )
                return self.data[target]
        # Not a legacy alias: defer to the normal dict lookup (may raise KeyError).
        return super().__getitem__(key)

    def __contains__(self, key):
        # A key is "present" if it exists directly or via its legacy alias.
        if key in self.data:
            return True
        target = self._resolve_legacy_key(key)
        if target is None:
            return False
        return target in self.data

    def _resolve_legacy_key(self, old_key):
        """Return the new-style key for a deprecated one, or `None` if unknown."""
        # Mask names are ambiguous: map to whichever mask this batch actually
        # carries, preferring the features mask over the values mask.
        if old_key in ("attention_mask", "padding_mask"):
            for candidate in ("audio_features_mask", "audio_values_mask"):
                if candidate in self.data:
                    return candidate
            return None
        return _LEGACY_KEY_MAP.get(old_key)


class AudioProcessingMixin(PreprocessingMixin):
    """
    This is an audio processor mixin used to provide saving/loading functionality for audio processors.
    """

    # Configuration hooks consumed by the PreprocessingMixin machinery; the
    # values keep audio processors file-compatible with feature extractors.
    _config_name = FEATURE_EXTRACTOR_NAME
    _type_key = "audio_processor_type"
    _nested_config_keys = ["audio_processor", "feature_extractor"]
    _auto_class_default = "AutoFeatureExtractor"
    _file_type_label = "audio processor"
    # Large derived arrays that must never be serialized into the config.
    _excluded_dict_keys = {"mel_filters", "window"}
    _extra_init_pops = ["feature_extractor_type"]
    _config_filename_kwarg = "audio_processor_filename"
    _subfolder_default = ""

    @classmethod
    def get_audio_processor_dict(
        cls, pretrained_model_name_or_path: str | os.PathLike, **kwargs
    ) -> tuple[dict[str, Any], dict[str, Any]]:
        """
        From a `pretrained_model_name_or_path`, resolve to a dictionary of parameters, to be used for instantiating an
        audio processor of type [`~audio_processing_base.AudioProcessingMixin`] using `from_dict`.

        Parameters:
            pretrained_model_name_or_path (`str` or `os.PathLike`):
                The identifier of the pre-trained checkpoint from which we want the dictionary of parameters.
            subfolder (`str`, *optional*, defaults to `""`):
                In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can
                specify the folder name here.
            audio_processor_filename (`str`, *optional*, defaults to `"preprocessor_config.json"`):
                The name of the file in the model directory to use for the audio processor config.

        Returns:
            `tuple[Dict, Dict]`: The dictionary(ies) that will be used to instantiate the audio processor object.
        """
        # Resolution (hub download, cache lookup, JSON parsing) lives in the
        # shared PreprocessingMixin helper.
        return cls._get_config_dict(pretrained_model_name_or_path, **kwargs)

    def fetch_audio(self, audio_url_or_urls: str | list[str] | list[list[str]], sampling_rate: int | None = None):
        """
        Convert a single or a list of urls into the corresponding `np.ndarray` objects.

        If a single url is passed, the return value will be a single object. If a list is passed a list of objects is
        returned. Nested lists are handled recursively, preserving their structure.
        """
        # NOTE(review): falls back to `self.sample_rate` — many extractors name
        # this attribute `sampling_rate`; confirm which one subclasses define.
        rate = sampling_rate if sampling_rate is not None else getattr(self, "sample_rate", 16000)
        if isinstance(audio_url_or_urls, list):
            loaded = []
            for entry in audio_url_or_urls:
                loaded.append(self.fetch_audio(entry, sampling_rate=rate))
            return loaded
        if isinstance(audio_url_or_urls, str):
            return load_audio(audio_url_or_urls, sampling_rate=rate)
        if is_valid_audio(audio_url_or_urls):
            # Already-decoded audio passes through untouched.
            return audio_url_or_urls
        raise TypeError(f"only a single or a list of entries is supported but got type={type(audio_url_or_urls)}")


# Re-bind `push_to_hub` to a private copy so its docstring placeholders can be
# filled with audio-processor wording without mutating the shared base method.
AudioProcessingMixin.push_to_hub = copy_func(AudioProcessingMixin.push_to_hub)
_push_doc = AudioProcessingMixin.push_to_hub.__doc__
if _push_doc is not None:  # docstrings are stripped under `python -OO`
    AudioProcessingMixin.push_to_hub.__doc__ = _push_doc.format(
        object="audio processor", object_class="AutoFeatureExtractor", object_files="audio processor file"
    )
del _push_doc
Loading
Loading