Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -176,3 +176,7 @@ tags
# Cursor IDE files
.cursor/
test-results/
src/transformers/models/audio_spectrogram_transformer/audio_processing_audio_spectrogram_transformer.py
.gitignore
tests/test_wav2vec2_whisper.py
run_preprocessing_tests.sh
5 changes: 5 additions & 0 deletions src/transformers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -355,6 +355,7 @@

_import_structure["utils.dummy_pt_objects"] = [name for name in dir(dummy_pt_objects) if not name.startswith("_")]
else:
_import_structure["audio_processing_backends"] = ["NumpyAudioBackend", "NumpyBackend", "TorchAudioBackend", "TorchBackend"]
_import_structure["model_debugging_utils"] = [
"model_addition_debugger_context",
]
Expand Down Expand Up @@ -477,6 +478,10 @@
if TYPE_CHECKING:
# All modeling imports
# Models
from .audio_processing_backends import NumpyAudioBackend as NumpyAudioBackend
from .audio_processing_backends import NumpyBackend as NumpyBackend
from .audio_processing_backends import TorchAudioBackend as TorchAudioBackend
from .audio_processing_backends import TorchBackend as TorchBackend
from .backbone_utils import BackboneConfigMixin, BackboneMixin
from .cache_utils import Cache as Cache
from .cache_utils import DynamicCache as DynamicCache
Expand Down
702 changes: 702 additions & 0 deletions src/transformers/audio_processing_backends.py

Large diffs are not rendered by default.

148 changes: 148 additions & 0 deletions src/transformers/audio_processing_base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
# Copyright 2025 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import warnings
from typing import Any, ClassVar, TypeVar

from .audio_utils import is_valid_audio, load_audio
from .feature_extraction_utils import BatchFeature as BaseBatchFeature
from .preprocessing_base import PreprocessingMixin
from .utils import (
FEATURE_EXTRACTOR_NAME,
copy_func,
logging,
)


# Maps deprecated feature-extractor output keys to their audio-processor
# replacements; consulted by BatchFeature when a caller looks up an old key.
_LEGACY_KEY_MAP = {
    "input_features": "audio_features",
    "input_values": "audio_values",
    "audio_input_features": "audio_features",
}


# Type variable for classmethod constructors that return an instance of the
# concrete AudioProcessingMixin subclass they were called on.
AudioProcessorType = TypeVar("AudioProcessorType", bound="AudioProcessingMixin")


# Module-level logger, following the library-wide logging convention.
logger = logging.get_logger(__name__)


class BatchFeature(BaseBatchFeature):
    r"""
    Holds the output of the audio processor specific `__call__` methods.

    Behaves like a plain dictionary, with one addition: lookups using the
    deprecated feature-extractor key names (e.g. `input_features`) are
    transparently redirected to the new audio-processor keys (e.g.
    `audio_features`), emitting a `FutureWarning` the first time each legacy
    key is used.

    Args:
        data (`dict`):
            Dictionary of lists/arrays/tensors returned by the __call__ method ('input_values', 'input_features', etc.).
        tensor_type (`Union[None, str, TensorType]`, *optional*):
            You can give a tensor_type here to convert the lists of integers in PyTorch/Numpy Tensors at
            initialization.
    """

    # Legacy keys already warned about, shared across every instance so each
    # deprecated name triggers at most one warning per process.
    _warned_keys: ClassVar[set] = set()

    def __getitem__(self, key):
        legacy_lookup = isinstance(key, str) and key not in self.data
        if legacy_lookup:
            target = self._resolve_legacy_key(key)
            if target is not None and target in self.data:
                if key not in BatchFeature._warned_keys:
                    BatchFeature._warned_keys.add(key)
                    warnings.warn(
                        f"Accessing '{key}' is deprecated, use '{target}' instead.",
                        FutureWarning,
                        stacklevel=2,
                    )
                return self.data[target]
        # Not a legacy alias: defer to the normal dict lookup (may raise KeyError).
        return super().__getitem__(key)

    def __contains__(self, key):
        # A key is "present" if it exists directly or via its legacy alias.
        if key in self.data:
            return True
        target = self._resolve_legacy_key(key)
        if target is None:
            return False
        return target in self.data

    def _resolve_legacy_key(self, old_key):
        """Return the new-style key for a deprecated one, or `None` if unknown."""
        # Mask names are ambiguous: map to whichever mask this batch actually
        # carries, preferring the features mask over the values mask.
        if old_key in ("attention_mask", "padding_mask"):
            for candidate in ("audio_features_mask", "audio_values_mask"):
                if candidate in self.data:
                    return candidate
            return None
        return _LEGACY_KEY_MAP.get(old_key)


class AudioProcessingMixin(PreprocessingMixin):
    """
    This is an audio processor mixin used to provide saving/loading functionality for audio processors.
    """

    # Configuration hooks consumed by the PreprocessingMixin machinery; the
    # values keep audio processors file-compatible with feature extractors.
    _config_name = FEATURE_EXTRACTOR_NAME
    _type_key = "audio_processor_type"
    _nested_config_keys = ["audio_processor", "feature_extractor"]
    _auto_class_default = "AutoFeatureExtractor"
    _file_type_label = "audio processor"
    # Large derived arrays that must never be serialized into the config.
    _excluded_dict_keys = {"mel_filters", "window"}
    _extra_init_pops = ["feature_extractor_type"]
    _config_filename_kwarg = "audio_processor_filename"
    _subfolder_default = ""

    @classmethod
    def get_audio_processor_dict(
        cls, pretrained_model_name_or_path: str | os.PathLike, **kwargs
    ) -> tuple[dict[str, Any], dict[str, Any]]:
        """
        From a `pretrained_model_name_or_path`, resolve to a dictionary of parameters, to be used for instantiating an
        audio processor of type [`~audio_processing_base.AudioProcessingMixin`] using `from_dict`.

        Parameters:
            pretrained_model_name_or_path (`str` or `os.PathLike`):
                The identifier of the pre-trained checkpoint from which we want the dictionary of parameters.
            subfolder (`str`, *optional*, defaults to `""`):
                In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can
                specify the folder name here.
            audio_processor_filename (`str`, *optional*, defaults to `"preprocessor_config.json"`):
                The name of the file in the model directory to use for the audio processor config.

        Returns:
            `tuple[Dict, Dict]`: The dictionary(ies) that will be used to instantiate the audio processor object.
        """
        # Resolution (hub download, cache lookup, JSON parsing) lives in the
        # shared PreprocessingMixin helper.
        return cls._get_config_dict(pretrained_model_name_or_path, **kwargs)

    def fetch_audio(self, audio_url_or_urls: str | list[str] | list[list[str]], sampling_rate: int | None = None):
        """
        Convert a single or a list of urls into the corresponding `np.ndarray` objects.

        If a single url is passed, the return value will be a single object. If a list is passed a list of objects is
        returned. Nested lists are handled recursively, preserving their structure.
        """
        # NOTE(review): falls back to `self.sample_rate` — many extractors name
        # this attribute `sampling_rate`; confirm which one subclasses define.
        rate = sampling_rate if sampling_rate is not None else getattr(self, "sample_rate", 16000)
        if isinstance(audio_url_or_urls, list):
            loaded = []
            for entry in audio_url_or_urls:
                loaded.append(self.fetch_audio(entry, sampling_rate=rate))
            return loaded
        if isinstance(audio_url_or_urls, str):
            return load_audio(audio_url_or_urls, sampling_rate=rate)
        if is_valid_audio(audio_url_or_urls):
            # Already-decoded audio passes through untouched.
            return audio_url_or_urls
        raise TypeError(f"only a single or a list of entries is supported but got type={type(audio_url_or_urls)}")


# Re-bind `push_to_hub` to a private copy so its docstring placeholders can be
# filled with audio-processor wording without mutating the shared base method.
AudioProcessingMixin.push_to_hub = copy_func(AudioProcessingMixin.push_to_hub)
_push_doc = AudioProcessingMixin.push_to_hub.__doc__
if _push_doc is not None:  # docstrings are stripped under `python -OO`
    AudioProcessingMixin.push_to_hub.__doc__ = _push_doc.format(
        object="audio processor", object_class="AutoFeatureExtractor", object_files="audio processor file"
    )
del _push_doc
Loading
Loading