diff --git a/.gitignore b/.gitignore
index 75f5a9998310..6ed5479ab0c4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -176,3 +176,7 @@ tags
# Cursor IDE files
.cursor/
test-results/
+src/transformers/models/audio_spectrogram_transformer/audio_processing_audio_spectrogram_transformer.py
+.gitignore
+tests/test_wav2vec2_whisper.py
+run_preprocessing_tests.sh
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index c66b077cac36..82b20dc39684 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -355,6 +355,7 @@
_import_structure["utils.dummy_pt_objects"] = [name for name in dir(dummy_pt_objects) if not name.startswith("_")]
else:
+ _import_structure["audio_processing_backends"] = ["NumpyAudioBackend", "NumpyBackend", "TorchAudioBackend", "TorchBackend"]
_import_structure["model_debugging_utils"] = [
"model_addition_debugger_context",
]
@@ -477,6 +478,10 @@
if TYPE_CHECKING:
# All modeling imports
# Models
+ from .audio_processing_backends import NumpyAudioBackend as NumpyAudioBackend
+ from .audio_processing_backends import NumpyBackend as NumpyBackend
+ from .audio_processing_backends import TorchAudioBackend as TorchAudioBackend
+ from .audio_processing_backends import TorchBackend as TorchBackend
from .backbone_utils import BackboneConfigMixin, BackboneMixin
from .cache_utils import Cache as Cache
from .cache_utils import DynamicCache as DynamicCache
diff --git a/src/transformers/audio_processing_backends.py b/src/transformers/audio_processing_backends.py
new file mode 100644
index 000000000000..20a4c8a1f4c8
--- /dev/null
+++ b/src/transformers/audio_processing_backends.py
@@ -0,0 +1,702 @@
+# Copyright 2025 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+
+import numpy as np
+
+from .audio_processing_utils import BaseAudioProcessor
+from .audio_utils import SpectrogramConfig, amplitude_to_db, mel_filter_bank, power_to_db
+from .utils import PaddingStrategy, is_torch_available, logging
+
+
+logger = logging.get_logger(__name__)
+
+
+if is_torch_available():
+ import torch
+
+
+# ── Torch frequency conversion utilities (used by TorchAudioBackend._mel_filter_bank) ──
+
+
+def _torch_hertz_to_mel_scalar(freq: float, mel_scale: str = "htk") -> float:
+ if mel_scale == "htk":
+ return 2595.0 * math.log10(1.0 + freq / 700.0)
+ elif mel_scale == "kaldi":
+ return 1127.0 * math.log(1.0 + freq / 700.0)
+ f_sp = 200.0 / 3
+ min_log_hz = 1000.0
+ min_log_mel = (min_log_hz - 0.0) / f_sp
+ logstep = math.log(6.4) / 27.0
+ if freq >= min_log_hz:
+ return min_log_mel + math.log(freq / min_log_hz) / logstep
+ return (freq - 0.0) / f_sp
+
+
+def _torch_hertz_to_mel(freq: "torch.Tensor", mel_scale: str = "htk") -> "torch.Tensor":
+ if mel_scale == "htk":
+ return 2595.0 * torch.log10(1.0 + freq / 700.0)
+ elif mel_scale == "kaldi":
+ return 1127.0 * torch.log(1.0 + freq / 700.0)
+ f_sp = 200.0 / 3
+ min_log_hertz = 1000.0
+ min_log_mel = min_log_hertz / f_sp
+ logstep = 27.0 / torch.log(torch.tensor(6.4))
+ mels = freq / f_sp
+ log_region = freq >= min_log_hertz
+ mels[log_region] = min_log_mel + torch.log(freq[log_region] / min_log_hertz) * logstep
+ return mels
+
+
+def _torch_mel_to_hertz(mels: "torch.Tensor", mel_scale: str = "htk") -> "torch.Tensor":
+ if mel_scale == "htk":
+ return 700.0 * (10.0 ** (mels / 2595.0) - 1.0)
+ elif mel_scale == "kaldi":
+ return 700.0 * (torch.exp(mels / 1127.0) - 1.0)
+ f_sp = 200.0 / 3
+ min_log_hz = 1000.0
+ min_log_mel = (min_log_hz - 0.0) / f_sp
+ logstep = math.log(6.4) / 27.0
+ freq = 0.0 + f_sp * mels
+ log_region = mels >= min_log_mel
+ freq[log_region] = min_log_hz * torch.exp(logstep * (mels[log_region] - min_log_mel))
+ return freq
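+
+
+# Illustrative round-trip (a sketch, not part of the module): hertz -> mel -> hertz
+# is the identity up to float error, e.g.
+#
+#     freqs = torch.tensor([100.0, 440.0, 4000.0])
+#     mels = _torch_hertz_to_mel(freqs, mel_scale="slaney")
+#     torch.testing.assert_close(_torch_mel_to_hertz(mels, mel_scale="slaney"), freqs)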
+
+
+def _torch_triangular_filter_bank(fft_freqs, filter_freqs, computation_dtype=None):
+    """Compute a triangular mel filter bank (torch analogue of the numpy triangular filters)."""
+    filter_diff = filter_freqs[1:] - filter_freqs[:-1]
+    slopes = filter_freqs.unsqueeze(0) - fft_freqs.unsqueeze(1)
+    down_slopes = -slopes[:, :-2] / filter_diff[:-1]
+    up_slopes = slopes[:, 2:] / filter_diff[1:]
+    return torch.clamp(torch.minimum(down_slopes, up_slopes), min=0)
+
+
+# ═══════════════════════════════════════════════════════════════════════════════
+# NumpyAudioBackend
+# ═══════════════════════════════════════════════════════════════════════════════
+
+
+class NumpyAudioBackend(BaseAudioProcessor):
+ """NumPy backend for portable CPU-only audio processing."""
+
+ @property
+ def backend(self) -> str:
+ return "numpy"
+
+ # ── Audio input processing ────────────────────────────────────────────
+
+ def _process_audio(self, audio_el):
+ if not isinstance(audio_el, np.ndarray):
+ audio_el = np.asarray(audio_el)
+ if audio_el.ndim > 1:
+ if self.force_mono and audio_el.shape[0] > 1:
+ audio_el = audio_el.mean(axis=0)
+ elif audio_el.shape[0] == 1:
+ audio_el = np.squeeze(audio_el, axis=0)
+ else:
+ raise ValueError("Audio has more than one channel but force_mono is False")
+ return audio_el
+
+ # ── Padding & batching ────────────────────────────────────────────────
+
+ def _pad_single(self, audio: np.ndarray, max_length: int) -> np.ndarray:
+ current_length = audio.shape[-1]
+ if current_length >= max_length:
+ return audio
+ pad_length = max_length - current_length
+ if self.padding_side == "right":
+ pad_width = [(0, 0)] * (audio.ndim - 1) + [(0, pad_length)]
+ elif self.padding_side == "left":
+ pad_width = [(0, 0)] * (audio.ndim - 1) + [(pad_length, 0)]
+ else:
+ raise ValueError(f"Invalid padding side: {self.padding_side}")
+ return np.pad(audio, pad_width, mode="constant", constant_values=self.padding_value)
+
+ def _to_batch(self, audio):
+ batch = np.stack(audio)
+ if self.add_channel_dim:
+ batch = batch[:, np.newaxis, :]
+ return batch
+
+ def _pad_features(self, features, padding, max_length, truncation, pad_to_multiple_of):
+ padding_strategy = self._get_padding_strategies(padding=padding, max_length=max_length)
+ if truncation and max_length is not None:
+ features = [f[:max_length] for f in features]
+ actual_lengths = [f.shape[0] for f in features]
+ if padding_strategy == PaddingStrategy.LONGEST:
+ max_length = max(actual_lengths)
+ padding_strategy = PaddingStrategy.MAX_LENGTH
+ if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
+ max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
+ if padding_strategy == PaddingStrategy.MAX_LENGTH and max_length is not None:
+ features = [
+ np.pad(f, [(0, max_length - f.shape[0])] + [(0, 0)] * (f.ndim - 1),
+ mode="constant", constant_values=self.padding_value)
+ if f.shape[0] < max_length else f
+ for f in features
+ ]
+ return features, [(0, length) for length in actual_lengths]
+
+ def _stack_features(self, features):
+ return np.stack(features)
+
+ # ── Masking ───────────────────────────────────────────────────────────
+
+ def _get_mask(self, audio_ranges, padded_length, do_extract_spectrogram, spectrogram_config):
+ use_audio_mask = self.mask_level == "audio"
+ if do_extract_spectrogram and not use_audio_mask:
+ spec_cfg = spectrogram_config or self.spectrogram_config
+ audio_lengths = np.array([end - start for start, end in audio_ranges])
+ features_lengths = self._get_features_lengths(audio_lengths, spec_cfg)
+ n_features = self._get_features_lengths(padded_length, spec_cfg, include_center_frame=True)
+ mask = (np.arange(n_features)[None, :] < features_lengths[:, None]).astype(np.int32)
+ return {"audio_features_mask": mask}
+ mask = np.zeros((len(audio_ranges), padded_length), dtype=np.int32)
+ for i, (start, end) in enumerate(audio_ranges):
+ mask[i, start:end] = 1
+ return {("audio_features_mask" if do_extract_spectrogram else "audio_values_mask"): mask}
+
+ def _get_feature_mask(self, feature_ranges, padded_length):
+ mask = np.zeros((len(feature_ranges), padded_length), dtype=np.int32)
+ for i, (start, end) in enumerate(feature_ranges):
+ mask[i, start:end] = 1
+ return {"audio_features_mask": mask}
+
+ # ── STFT pipeline ─────────────────────────────────────────────────────
+
+ def _create_stft_window(self, win_length, stft_cfg, audio):
+ N = win_length + 1 if stft_cfg.periodic else win_length
+ fac = np.linspace(-np.pi, np.pi, N)
+ name = stft_cfg.window_fn
+ if name in ("hann", "hann_window"):
+ w = 0.5 + 0.5 * np.cos(fac)
+ elif name in ("hamming", "hamming_window"):
+ w = 0.54 + 0.46 * np.cos(fac)
+ elif name == "boxcar":
+ w = np.ones(N)
+ elif name == "povey":
+ w = (0.5 + 0.5 * np.cos(fac)) ** 0.85
+ else:
+ raise ValueError(f"Unknown window function '{name}'")
+ return w[:win_length] if stft_cfg.periodic else w
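+
+    # Illustrative check (a sketch, not part of the class): in the symmetric
+    # (periodic=False) case this formula reproduces NumPy's reference Hann window:
+    #
+    #     w = 0.5 + 0.5 * np.cos(np.linspace(-np.pi, np.pi, 400))
+    #     assert np.allclose(w, np.hanning(400))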
+
+ def _prepare_window_and_framing(self, window, win_length, n_fft, needs_manual_framing):
+ if needs_manual_framing and win_length < n_fft:
+ return window, win_length
+ if win_length < n_fft:
+ left_pad = (n_fft - win_length) // 2
+ right_pad = n_fft - win_length - left_pad
+ window = np.pad(window, (left_pad, right_pad))
+ return window, n_fft
+
+ @staticmethod
+ def _np_frame(x, frame_length, hop_length):
+ """Create overlapping frames using stride tricks (replaces librosa.util.frame)."""
+ n_frames = 1 + (x.shape[-1] - frame_length) // hop_length
+ strides = x.strides[:-1] + (x.strides[-1] * hop_length, x.strides[-1])
+ shape = x.shape[:-1] + (n_frames, frame_length)
+ return np.lib.stride_tricks.as_strided(x, shape=shape, strides=strides)
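+
+    # Illustrative shapes (a sketch): a 16000-sample signal framed with
+    # frame_length=400 and hop_length=160 yields 1 + (16000 - 400) // 160 = 98 frames:
+    #
+    #     x = np.zeros(16000, dtype=np.float32)
+    #     NumpyAudioBackend._np_frame(x, 400, 160).shape  # (98, 400)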
+
+ def _frame_waveform(self, waveform, frame_length, hop_length, n_fft, center, pad_mode):
+ squeezed = waveform.ndim == 1
+ if squeezed:
+ waveform = waveform[np.newaxis, :]
+
+ if center:
+ start_k = int(np.ceil(n_fft // 2 / hop_length))
+ tail_k = (waveform.shape[-1] + n_fft // 2 - n_fft) // hop_length + 1
+
+ if tail_k <= start_k:
+ # Short audio: simple center-pad and index-based framing
+ waveform = np.pad(waveform, ((0, 0), (frame_length // 2, frame_length // 2)), mode=pad_mode)
+ num_frames = 1 + (waveform.shape[-1] - frame_length) // hop_length
+ frame_starts = np.arange(num_frames) * hop_length
+ frames = waveform[:, frame_starts[:, np.newaxis] + np.arange(frame_length)]
+ else:
+ # Long audio: split into pre (left-padded), middle (no pad), post (right-padded)
+ # to handle edge effects from center padding correctly
+ padding = [(0, 0) for _ in range(waveform.ndim)]
+
+ padding[-1] = (frame_length // 2, 0)
+ y_pre = np.pad(waveform[..., : (start_k - 1) * hop_length - n_fft // 2 + n_fft + 1], padding, mode=pad_mode)
+ y_frames_pre = self._np_frame(y_pre, frame_length, hop_length)[..., :start_k, :]
+
+ padding[-1] = (0, frame_length // 2)
+ y_post = np.pad(waveform[..., tail_k * hop_length - n_fft // 2 :], padding, mode=pad_mode)
+ y_frames_post = self._np_frame(y_post, frame_length, hop_length)
+
+ start = start_k * hop_length - n_fft // 2
+ y_frames_middle = self._np_frame(np.ascontiguousarray(waveform[..., start:]), frame_length, hop_length)
+
+ num_frames = y_frames_pre.shape[-2] + y_frames_middle.shape[-2] + y_frames_post.shape[-2]
+ frames = np.concatenate([y_frames_pre, y_frames_middle, y_frames_post], axis=-2)
+ else:
+ # Non-centered: simple index-based framing
+ num_frames = 1 + (waveform.shape[-1] - frame_length) // hop_length
+ frame_starts = np.arange(num_frames) * hop_length
+ frames = waveform[:, frame_starts[:, np.newaxis] + np.arange(frame_length)]
+
+ if squeezed:
+ frames = frames.squeeze(0)
+ return frames, num_frames
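+
+    # Illustrative frame count (a sketch): with center=True and an even
+    # frame_length equal to n_fft, the signal is padded by frame_length // 2 on
+    # each side, so 16000 samples with frame_length=400 and hop_length=160
+    # produce 1 + 16000 // 160 = 101 frames (the short- and long-audio paths
+    # above agree on this count).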
+
+ def _frame_audio(self, audio, window, frame_length, hop_length, n_fft, stft_cfg):
+ frames, _ = self._frame_waveform(audio, frame_length, hop_length, n_fft, stft_cfg.center, stft_cfg.pad_mode)
+ compute_dtype = np.result_type(audio.dtype, window.dtype)
+ return frames.astype(compute_dtype, copy=False)
+
+ def _apply_frame_processing(self, frames, *, spectrogram_config, **kwargs):
+ if spectrogram_config.remove_dc_offset:
+ frames = frames - frames.mean(axis=-1, keepdims=True)
+ preemphasis = spectrogram_config.preemphasis
+ if preemphasis is not None:
+ preemph_src = preemphasis * frames[..., :-1]
+ frames[..., 1:] = frames[..., 1:] - preemph_src
+ frames[..., 0] = frames[..., 0] * (1 - preemphasis)
+ return frames
+
+ def _window_and_fft(self, frames, window, frame_length, n_fft, stft_cfg, audio_dtype=None):
+ frames = frames * window
+ spec = np.fft.rfft(frames, n=n_fft, axis=-1).astype(np.complex64)
+ if stft_cfg.normalized:
+ spec = spec / np.sqrt(np.sum(window**2)).astype(spec.real.dtype)
+ return np.moveaxis(spec, -1, -2)
+
+ def _native_stft(self, audio, window, frame_length, hop_length, n_fft, stft_cfg):
+ frames, _ = self._frame_waveform(audio, frame_length, hop_length, n_fft, stft_cfg.center, stft_cfg.pad_mode)
+ compute_dtype = np.result_type(audio.dtype, window.dtype)
+ frames = frames.astype(compute_dtype, copy=False) * window
+ spec = np.fft.rfft(frames, n=n_fft, axis=-1).astype(np.complex64)
+ if stft_cfg.normalized:
+ spec = spec / np.sqrt(np.sum(window**2)).astype(spec.real.dtype)
+ return np.moveaxis(spec, -1, -2)
+
+ def _compute_magnitudes(self, stft_out, power, spectrogram_config=None):
+ # computation_dtype signals that upstream FE used float64 magnitudes
+ if spectrogram_config and spectrogram_config.computation_dtype:
+ return np.abs(stft_out, dtype=np.float64) ** power
+ return np.abs(stft_out) ** power
+
+ # ── Mel scale & normalization ─────────────────────────────────────────
+
+ def _mel_filter_bank(self, spectrogram_config: SpectrogramConfig):
+ stft_cfg = spectrogram_config.stft_config
+ mel_cfg = spectrogram_config.mel_scale_config
+ # float32 dtype matches librosa's per-band rounding; computation_dtype keeps float64
+ filter_dtype = None if spectrogram_config.computation_dtype else np.float32
+ return mel_filter_bank(
+ num_frequency_bins=1 + stft_cfg.n_fft // 2,
+ num_mel_filters=mel_cfg.n_mels,
+ min_frequency=mel_cfg.f_min,
+ max_frequency=mel_cfg.f_max if mel_cfg.f_max is not None else self.sample_rate / 2,
+ sampling_rate=self.sample_rate,
+ norm=mel_cfg.norm,
+ mel_scale=mel_cfg.mel_scale,
+ triangularize_in_mel_space=mel_cfg.triangularize_in_mel_space,
+ dtype=filter_dtype,
+ )
+
+    def _apply_mel_scale(self, features, *, spectrogram_config, **kwargs):
+        mel_filters = self.mel_filters.astype(features.dtype, copy=False)
+        if spectrogram_config.mel_scale_config.matmul_order == "features_first":
+            # (..., freq, time) -> (..., time, freq) @ (freq, n_mels), mirroring the torch backend
+            mel_spec = np.matmul(np.swapaxes(features, -2, -1), mel_filters)
+        else:
+            mel_spec = np.matmul(mel_filters.T, features)
+        return np.maximum(spectrogram_config.mel_floor, mel_spec)
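+
+    # Illustrative shapes (assuming `mel_filters` of shape (freq, n_mels) from
+    # `mel_filter_bank`): a (freq, time) spectrogram maps to (time, n_mels) with
+    # matmul_order="features_first" and to (n_mels, time) otherwise.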
+
+ def _normalize_magnitude(self, features, *, spectrogram_config,
+ reference=1.0, min_value=1e-10, db_range=None,
+ dtype=np.float32, **kwargs):
+ log_mel = spectrogram_config.log_mode
+ if log_mel is None:
+ return features.astype(dtype)
+
+ mel_floor = spectrogram_config.mel_floor
+ result = np.maximum(mel_floor, features)
+
+ if log_mel == "log":
+ result = np.log(result).astype(dtype)
+ elif log_mel == "log10":
+ result = np.log10(result).astype(dtype)
+ elif log_mel == "dB":
+ power = spectrogram_config.stft_config.power
+ if power == 1.0:
+ result = amplitude_to_db(result, reference, min_value, db_range).astype(dtype)
+ elif power == 2.0:
+ result = power_to_db(result, reference, min_value, db_range).astype(dtype)
+ else:
+ raise ValueError(f"Cannot use log_mel option 'dB' with power {power}")
+ else:
+ raise ValueError(f"Unknown log_mel option: {log_mel}")
+ return result
+
+ # ── Kaldi fbank helper ────────────────────────────────────────────────
+
+ def _kaldi_fbank(self, waveform, num_mel_bins, sample_frequency=None, **kwargs):
+ """Extract kaldi-compatible fbank features using torchaudio (or fallback to base pipeline).
+
+ Returns numpy array of shape (time, num_mel_bins).
+ """
+ from .utils import is_speech_available
+
+ if sample_frequency is None:
+ sample_frequency = self.sample_rate
+
+ if is_speech_available():
+ import torch
+ import torchaudio.compliance.kaldi as ta_kaldi
+
+ waveform_tensor = torch.from_numpy(np.asarray(waveform)).unsqueeze(0)
+ fbank = ta_kaldi.fbank(waveform_tensor, num_mel_bins=num_mel_bins,
+ sample_frequency=sample_frequency, **kwargs)
+ return fbank.numpy()
+
+ waveform = np.squeeze(waveform)
+ features = self.extract_spectrogram([waveform], spectrogram_config=self.spectrogram_config)
+ return features[0].T
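+
+    # Illustrative usage (a sketch; extra kwargs are forwarded to
+    # torchaudio.compliance.kaldi.fbank when torchaudio is available):
+    #
+    #     feats = processor._kaldi_fbank(waveform, num_mel_bins=80,
+    #                                    frame_length=25.0, frame_shift=10.0)
+    #     feats.shape  # (num_frames, 80)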
+
+
+# ═══════════════════════════════════════════════════════════════════════════════
+# TorchAudioBackend
+# ═══════════════════════════════════════════════════════════════════════════════
+
+
+class TorchAudioBackend(BaseAudioProcessor):
+ """Torch backend for audio processing."""
+
+ @property
+ def backend(self) -> str:
+ return "torch"
+
+ # ── Audio input processing ────────────────────────────────────────────
+
+ def _process_audio(self, audio_el):
+ if isinstance(audio_el, np.ndarray):
+ audio_el = torch.from_numpy(audio_el)
+ if audio_el.ndim > 1:
+ if self.force_mono and audio_el.shape[0] > 1:
+ audio_el = audio_el.mean(dim=0)
+ elif audio_el.shape[0] == 1:
+ audio_el = audio_el.squeeze(0)
+ else:
+ raise ValueError("Audio has more than one channel but force_mono is False")
+ return audio_el
+
+ # ── Padding & batching ────────────────────────────────────────────────
+
+ def _pad_single(self, audio, max_length):
+ current_length = audio.shape[-1]
+ if current_length >= max_length:
+ return audio
+ if self.padding_side == "right":
+ pad_args = (0, max_length - current_length)
+ elif self.padding_side == "left":
+ pad_args = (max_length - current_length, 0)
+ else:
+ raise ValueError(f"Invalid padding side: {self.padding_side}")
+ return torch.nn.functional.pad(audio, pad_args, "constant", self.padding_value)
+
+ def _to_batch(self, audio):
+ batch = torch.stack(audio)
+ if self.add_channel_dim:
+ batch = batch.unsqueeze(1)
+ return batch
+
+ def _pad_features(self, features, padding, max_length, truncation, pad_to_multiple_of):
+ padding_strategy = self._get_padding_strategies(padding=padding, max_length=max_length)
+ if truncation and max_length is not None:
+ features = [f[:max_length] for f in features]
+ actual_lengths = [f.shape[0] for f in features]
+ if padding_strategy == PaddingStrategy.LONGEST:
+ max_length = max(actual_lengths)
+ padding_strategy = PaddingStrategy.MAX_LENGTH
+ if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
+ max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
+ if padding_strategy == PaddingStrategy.MAX_LENGTH and max_length is not None:
+ padded = []
+ for f in features:
+ if f.shape[0] < max_length:
+ pad_args = [0, 0] * (f.ndim - 1) + [0, max_length - f.shape[0]]
+ f = torch.nn.functional.pad(f, pad_args, "constant", self.padding_value)
+ padded.append(f)
+ features = padded
+ return features, [(0, length) for length in actual_lengths]
+
+ def _stack_features(self, features):
+ return torch.stack(features)
+
+ # ── Masking ───────────────────────────────────────────────────────────
+
+ def _get_mask(self, audio_ranges, padded_length, do_extract_spectrogram, spectrogram_config):
+ use_audio_mask = self.mask_level == "audio"
+ if do_extract_spectrogram and not use_audio_mask:
+ spec_cfg = spectrogram_config or self.spectrogram_config
+ audio_lengths = torch.tensor([end - start for start, end in audio_ranges])
+ features_lengths = self._get_features_lengths(audio_lengths, spec_cfg)
+ n_features = self._get_features_lengths(padded_length, spec_cfg, include_center_frame=True)
+ mask = (torch.arange(n_features)[None, :] < features_lengths[:, None]).to(torch.int32)
+ return {"audio_features_mask": mask}
+ mask = torch.zeros((len(audio_ranges), padded_length), dtype=torch.int32)
+ for i, (start, end) in enumerate(audio_ranges):
+ mask[i, start:end] = 1
+ return {("audio_features_mask" if do_extract_spectrogram else "audio_values_mask"): mask}
+
+ def _get_feature_mask(self, feature_ranges, padded_length):
+ mask = torch.zeros((len(feature_ranges), padded_length), dtype=torch.int32)
+ for i, (start, end) in enumerate(feature_ranges):
+ mask[i, start:end] = 1
+ return {"audio_features_mask": mask}
+
+ # ── STFT pipeline ─────────────────────────────────────────────────────
+
+ def _needs_manual_framing(self, spectrogram_config):
+ return super()._needs_manual_framing(spectrogram_config) or spectrogram_config.stft_config.left_align_fft
+
+ def _create_stft_window(self, win_length, stft_cfg, audio):
+ dtype = getattr(torch, stft_cfg.window_dtype) if stft_cfg.window_dtype else audio.dtype
+ wkwargs = {**(stft_cfg.wkwargs or {}), "dtype": dtype}
+ name = stft_cfg.window_fn
+ if name in ("hann", "hann_window"):
+ window = torch.hann_window(win_length, periodic=stft_cfg.periodic, **wkwargs)
+ elif name in ("hamming", "hamming_window"):
+ window = torch.hamming_window(win_length, periodic=stft_cfg.periodic, **wkwargs)
+ elif name == "boxcar":
+            window = torch.ones(win_length, dtype=dtype)
+ elif name == "povey":
+ window = torch.hann_window(win_length, periodic=stft_cfg.periodic, **wkwargs).pow(0.85)
+ else:
+ raise ValueError(f"Unknown window function '{name}'")
+ return window.to(device=audio.device)
+
+ def _prepare_window_and_framing(self, window, win_length, n_fft, needs_manual_framing):
+ if needs_manual_framing and win_length < n_fft:
+ return window, win_length
+ if win_length < n_fft:
+ left_pad = (n_fft - win_length) // 2
+ right_pad = n_fft - win_length - left_pad
+ window = torch.nn.functional.pad(window, (left_pad, right_pad))
+ return window, n_fft
+
+ def _frame_audio(self, audio, window, frame_length, hop_length, n_fft, stft_cfg):
+ if stft_cfg.center:
+ audio = torch.nn.functional.pad(audio, (frame_length // 2, frame_length // 2), mode=stft_cfg.pad_mode)
+ return audio.unfold(-1, frame_length, hop_length)
+
+ def _apply_frame_processing(self, frames, *, spectrogram_config, **kwargs):
+ if spectrogram_config.remove_dc_offset:
+ frames = frames - frames.mean(dim=-1, keepdim=True)
+ preemphasis = spectrogram_config.preemphasis
+ if preemphasis is not None:
+ frames = torch.cat([
+ frames[..., :1] * (1 - preemphasis),
+ frames[..., 1:] - preemphasis * frames[..., :-1],
+ ], dim=-1)
+ return frames
+
+ def _window_and_fft(self, frames, window, frame_length, n_fft, stft_cfg, audio_dtype=None):
+ frames = frames * window
+ if frame_length < n_fft:
+ frames = torch.nn.functional.pad(frames, (0, n_fft - frame_length))
+ spec = torch.fft.rfft(frames, n=n_fft)
+ if stft_cfg.normalized:
+ spec = spec / window.pow(2.0).sum().sqrt()
+ return spec.transpose(-2, -1)
+
+ def _native_stft(self, audio, window, frame_length, hop_length, n_fft, stft_cfg):
+ stft_out = torch.stft(
+ audio, n_fft=n_fft, hop_length=hop_length, win_length=frame_length,
+ window=window, center=stft_cfg.center, pad_mode=stft_cfg.pad_mode,
+ normalized=False, return_complex=True,
+ )
+ if stft_cfg.normalized:
+ stft_out = stft_out / window.pow(2.0).sum().sqrt()
+ return stft_out
+
+ def _cast_stft_output(self, magnitudes, spectrogram_config):
+ if spectrogram_config.computation_dtype:
+ return magnitudes
+ return magnitudes.float()
+
+ def _compute_magnitudes(self, stft_out, power, spectrogram_config=None):
+ return stft_out.abs() ** power
+
+ # ── Mel scale & normalization ─────────────────────────────────────────
+
+ def _mel_filter_bank(self, spectrogram_config: SpectrogramConfig):
+ stft_cfg = spectrogram_config.stft_config
+ mel_cfg = spectrogram_config.mel_scale_config
+ computation_dtype = getattr(torch, mel_cfg.computation_dtype) if mel_cfg.computation_dtype else None
+ num_frequency_bins = 1 + stft_cfg.n_fft // 2
+ num_mel_filters = mel_cfg.n_mels
+ min_frequency = mel_cfg.f_min
+ max_frequency = mel_cfg.f_max if mel_cfg.f_max is not None else self.sample_rate / 2
+ n_fft = (num_frequency_bins - 1) * 2
+
+ if mel_cfg.triangularize_in_mel_space and mel_cfg.bands_to_zero == 0:
+ # Kaldi-exact path: matches torchaudio.compliance.kaldi.get_mel_banks
+ mel_filters = self._kaldi_exact_mel_banks(
+ num_mel_filters, num_frequency_bins, min_frequency, max_frequency,
+ self.sample_rate, n_fft,
+ )
+ elif mel_cfg.triangularize_in_mel_space:
+ mel_filters = self._kaldi_mel_banks_with_zero_bands(
+ num_mel_filters, num_frequency_bins, min_frequency, max_frequency,
+ self.sample_rate, n_fft, mel_cfg, computation_dtype,
+ )
+ else:
+ mel_filters = self._standard_mel_banks(
+ num_mel_filters, num_frequency_bins, min_frequency, max_frequency,
+ self.sample_rate, n_fft, mel_cfg, computation_dtype,
+ )
+
+ # Cast back when mel computation_dtype doesn't match spectrogram computation_dtype
+ if computation_dtype is not None and not spectrogram_config.computation_dtype:
+ mel_filters = mel_filters.to(torch.get_default_dtype())
+ return mel_filters
+
+ @staticmethod
+ def _kaldi_exact_mel_banks(num_mel_filters, num_frequency_bins, min_frequency,
+ max_frequency, sampling_rate, n_fft):
+ """Matches torchaudio.compliance.kaldi.get_mel_banks exactly."""
+ num_fft_bins = n_fft // 2
+ fft_bin_width = sampling_rate / n_fft
+ mel_low = 1127.0 * math.log(1.0 + min_frequency / 700.0)
+ mel_high = 1127.0 * math.log(1.0 + max_frequency / 700.0)
+ mel_delta = (mel_high - mel_low) / (num_mel_filters + 1)
+
+ bin_idx = torch.arange(num_mel_filters).unsqueeze(1)
+ left_mel = mel_low + bin_idx * mel_delta
+ center_mel = mel_low + (bin_idx + 1.0) * mel_delta
+ right_mel = mel_low + (bin_idx + 2.0) * mel_delta
+
+ mel = 1127.0 * (1.0 + fft_bin_width * torch.arange(num_fft_bins) / 700.0).log()
+ mel = mel.unsqueeze(0)
+
+ up_slope = (mel - left_mel) / (center_mel - left_mel)
+ down_slope = (right_mel - mel) / (right_mel - center_mel)
+ banks = torch.max(torch.zeros(1), torch.min(up_slope, down_slope))
+ banks = torch.nn.functional.pad(banks, (0, 1), mode="constant", value=0)
+ return banks.T
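+
+    # Illustrative shape (a sketch): the (num_mel_filters, n_fft // 2) banks are
+    # padded with a zero column for the Nyquist bin and transposed, so the
+    # returned filter bank has shape (num_frequency_bins, num_mel_filters).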
+
+ @staticmethod
+ def _kaldi_mel_banks_with_zero_bands(num_mel_filters, num_frequency_bins, min_frequency,
+ max_frequency, sampling_rate, n_fft, mel_cfg, computation_dtype):
+ """Kaldi-style with bands_to_zero > 0."""
+ mel_min = _torch_hertz_to_mel_scalar(min_frequency, mel_scale=mel_cfg.mel_scale)
+ mel_max = _torch_hertz_to_mel_scalar(max_frequency, mel_scale=mel_cfg.mel_scale)
+ mel_delta = (mel_max - mel_min) / (num_mel_filters + 1)
+ bin_idx = torch.arange(num_mel_filters, dtype=computation_dtype).unsqueeze(1)
+ left_mel = mel_min + bin_idx * mel_delta
+ center_mel = mel_min + (bin_idx + 1.0) * mel_delta
+ right_mel = mel_min + (bin_idx + 2.0) * mel_delta
+
+ fft_bin_width = sampling_rate / n_fft
+ hz_freqs = fft_bin_width * torch.arange(mel_cfg.bands_to_zero, num_frequency_bins, dtype=computation_dtype)
+ mel = _torch_hertz_to_mel(hz_freqs, mel_scale=mel_cfg.mel_scale).unsqueeze(0)
+
+ up_slope = (mel - left_mel) / (center_mel - left_mel)
+ down_slope = (right_mel - mel) / (right_mel - center_mel)
+ zero = torch.zeros(1, dtype=computation_dtype)
+ mel_filters = torch.max(zero, torch.min(up_slope, down_slope)).T
+ if mel_cfg.bands_to_zero > 0:
+ mel_filters = torch.nn.functional.pad(mel_filters, (0, 0, mel_cfg.bands_to_zero, 0))
+ return mel_filters
+
+ @staticmethod
+ def _standard_mel_banks(num_mel_filters, num_frequency_bins, min_frequency,
+ max_frequency, sampling_rate, n_fft, mel_cfg, computation_dtype):
+ """Standard (non-kaldi) triangular mel filter bank."""
+ mel_min = _torch_hertz_to_mel_scalar(min_frequency, mel_scale=mel_cfg.mel_scale)
+ mel_max = _torch_hertz_to_mel_scalar(max_frequency, mel_scale=mel_cfg.mel_scale)
+ mel_freqs = torch.linspace(mel_min, mel_max, num_mel_filters + 2, dtype=computation_dtype)
+ filter_freqs = _torch_mel_to_hertz(mel_freqs, mel_scale=mel_cfg.mel_scale)
+
+ if mel_cfg.frequency_bin_mode == "rfft":
+ fft_freqs = torch.fft.rfftfreq(n=n_fft, d=1.0 / sampling_rate)
+ else:
+ fft_freqs = torch.linspace(0, sampling_rate // 2, num_frequency_bins)
+ if computation_dtype is not None:
+ fft_freqs = fft_freqs.to(computation_dtype)
+
+ filter_diff = filter_freqs[1:] - filter_freqs[:-1]
+ slopes = filter_freqs.unsqueeze(0) - fft_freqs.unsqueeze(1)
+ down_slopes = -slopes[:, :-2] / filter_diff[:-1]
+ up_slopes = slopes[:, 2:] / filter_diff[1:]
+ mel_filters = torch.clamp(torch.minimum(down_slopes, up_slopes), min=0)
+
+ if mel_cfg.norm == "slaney":
+ enorm = 2.0 / (filter_freqs[2 : num_mel_filters + 2] - filter_freqs[:num_mel_filters])
+ mel_filters = mel_filters * enorm.unsqueeze(0)
+
+ if mel_cfg.bands_to_zero > 0:
+ mel_filters = torch.nn.functional.pad(mel_filters, (0, 0, mel_cfg.bands_to_zero, 0))
+ return mel_filters
+
+ def _apply_mel_scale(self, features, *, spectrogram_config, **kwargs):
+ mel_filters = self.mel_filters.to(device=features.device)
+ if spectrogram_config.mel_scale_config.matmul_order == "features_first":
+ mel_spec = torch.matmul(features.transpose(-2, -1), mel_filters)
+ else:
+ # F.linear matches torchaudio's MelScale implementation exactly
+ mel_spec = torch.nn.functional.linear(features.transpose(-2, -1), mel_filters.T).transpose(-2, -1)
+ return torch.clamp(mel_spec, min=spectrogram_config.mel_floor)
+
+ def _normalize_magnitude(self, features, *, spectrogram_config,
+ reference=1.0, min_value=1e-10, db_range=None,
+ dtype=None, **kwargs):
+ log_mel = spectrogram_config.log_mode
+ mel_floor = spectrogram_config.mel_floor
+ power = spectrogram_config.stft_config.power
+ if dtype is None:
+ dtype = torch.float32
+
+ if log_mel is None:
+ return features
+
+ result = torch.clamp(features, min=mel_floor)
+
+ if log_mel == "log":
+ result = torch.log(result).to(dtype)
+ elif log_mel == "log10":
+ result = torch.log10(result).to(dtype)
+ elif log_mel == "dB":
+ if reference <= 0.0:
+ raise ValueError("reference must be greater than zero")
+ if min_value <= 0.0:
+ raise ValueError("min_value must be greater than zero")
+ reference = max(min_value, reference)
+ multiplier = 10.0 if power == 2.0 else 20.0 if power == 1.0 else None
+ if multiplier is None:
+ raise ValueError(f"Cannot use log_mel option 'dB' with power {power}")
+ log_ref = torch.log10(torch.tensor(reference, dtype=result.dtype, device=result.device))
+ result = torch.clamp(result, min=min_value)
+ result = multiplier * (torch.log10(result) - log_ref)
+ if db_range is not None:
+ if db_range <= 0.0:
+ raise ValueError("db_range must be greater than zero")
+                max_vals = result.amax(dim=(-2, -1), keepdim=True) if result.ndim > 2 else result.max()
+ result = torch.clamp(result, min=max_vals - db_range)
+ result = result.to(dtype)
+ else:
+ raise ValueError(f"Unknown log_mel option: {log_mel}")
+
+ if spectrogram_config.skip_last_frame:
+ result = result[..., :-1]
+
+        return result
+
+
+# Short aliases matching the names exported from `transformers.__init__`
+NumpyBackend = NumpyAudioBackend
+TorchBackend = TorchAudioBackend
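+
+
+# Illustrative end-to-end usage (a minimal sketch; `MyProcessor` is hypothetical):
+#
+#     class MyProcessor(NumpyAudioBackend):
+#         sample_rate = 16000
+#         force_mono = True
+#
+#     processor = MyProcessor()
+#     batch = processor(np.zeros(16000, dtype=np.float32), sampling_rate=16000)
+#     batch["audio_values"].shape  # (1, 16000)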
diff --git a/src/transformers/audio_processing_base.py b/src/transformers/audio_processing_base.py
new file mode 100644
index 000000000000..6fba8be02082
--- /dev/null
+++ b/src/transformers/audio_processing_base.py
@@ -0,0 +1,148 @@
+# Copyright 2025 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import warnings
+from typing import Any, ClassVar, TypeVar
+
+from .audio_utils import is_valid_audio, load_audio
+from .feature_extraction_utils import BatchFeature as BaseBatchFeature
+from .preprocessing_base import PreprocessingMixin
+from .utils import (
+ FEATURE_EXTRACTOR_NAME,
+ copy_func,
+ logging,
+)
+
+
+_LEGACY_KEY_MAP = {
+ "input_features": "audio_features",
+ "input_values": "audio_values",
+ "audio_input_features": "audio_features",
+}
+
+
+AudioProcessorType = TypeVar("AudioProcessorType", bound="AudioProcessingMixin")
+
+
+logger = logging.get_logger(__name__)
+
+
+class BatchFeature(BaseBatchFeature):
+ r"""
+ Holds the output of the audio processor specific `__call__` methods.
+
+ This class is derived from a python dictionary and can be used as a dictionary.
+
+ Args:
+ data (`dict`):
+            Dictionary of lists/arrays/tensors returned by the __call__ method ('audio_values', 'audio_features',
+            etc.).
+        tensor_type (`Union[None, str, TensorType]`, *optional*):
+            You can give a `tensor_type` here to convert the lists of integers into PyTorch/NumPy tensors at
+            initialization.
+ """
+
+ _warned_keys: ClassVar[set] = set()
+
+ def __getitem__(self, item):
+ if isinstance(item, str) and item not in self.data:
+ new_key = self._resolve_legacy_key(item)
+ if new_key is not None and new_key in self.data:
+ if item not in BatchFeature._warned_keys:
+ warnings.warn(
+ f"Accessing '{item}' is deprecated, use '{new_key}' instead.",
+ FutureWarning,
+ stacklevel=2,
+ )
+ BatchFeature._warned_keys.add(item)
+ return self.data[new_key]
+ return super().__getitem__(item)
+
+ def __contains__(self, item):
+ if item in self.data:
+ return True
+ new_key = self._resolve_legacy_key(item)
+ return new_key is not None and new_key in self.data
+
+ def _resolve_legacy_key(self, old_key):
+ if old_key in ("attention_mask", "padding_mask"):
+ if "audio_features_mask" in self.data:
+ return "audio_features_mask"
+ if "audio_values_mask" in self.data:
+ return "audio_values_mask"
+ return None
+ return _LEGACY_KEY_MAP.get(old_key)
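+
+    # Illustrative legacy access (a sketch): old keys resolve to the new names
+    # with a one-time FutureWarning:
+    #
+    #     feat = BatchFeature(data={"audio_features": features})
+    #     feat["input_features"]  # returns feat.data["audio_features"]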
+
+
+class AudioProcessingMixin(PreprocessingMixin):
+ """
+ This is an audio processor mixin used to provide saving/loading functionality for audio processors.
+ """
+
+ _config_name = FEATURE_EXTRACTOR_NAME
+ _type_key = "audio_processor_type"
+ _nested_config_keys = ["audio_processor", "feature_extractor"]
+ _auto_class_default = "AutoFeatureExtractor"
+ _file_type_label = "audio processor"
+ _excluded_dict_keys = {"mel_filters", "window"}
+ _extra_init_pops = ["feature_extractor_type"]
+ _config_filename_kwarg = "audio_processor_filename"
+ _subfolder_default = ""
+
+ @classmethod
+ def get_audio_processor_dict(
+ cls, pretrained_model_name_or_path: str | os.PathLike, **kwargs
+ ) -> tuple[dict[str, Any], dict[str, Any]]:
+ """
+ From a `pretrained_model_name_or_path`, resolve to a dictionary of parameters, to be used for instantiating an
+ audio processor of type [`~audio_processing_base.AudioProcessingMixin`] using `from_dict`.
+
+ Parameters:
+ pretrained_model_name_or_path (`str` or `os.PathLike`):
+ The identifier of the pre-trained checkpoint from which we want the dictionary of parameters.
+ subfolder (`str`, *optional*, defaults to `""`):
+ In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can
+ specify the folder name here.
+ audio_processor_filename (`str`, *optional*, defaults to `"preprocessor_config.json"`):
+ The name of the file in the model directory to use for the audio processor config.
+
+ Returns:
+ `tuple[Dict, Dict]`: The dictionary(ies) that will be used to instantiate the audio processor object.
+ """
+ return cls._get_config_dict(pretrained_model_name_or_path, **kwargs)
+
+ def fetch_audio(self, audio_url_or_urls: str | list[str] | list[list[str]], sampling_rate: int | None = None):
+ """
+        Convert a single URL or a list of URLs into the corresponding `np.ndarray` objects.
+
+        If a single URL is passed, the return value is a single object; if a list is passed, a list of objects is
+        returned.
+ """
+ if sampling_rate is None:
+ sampling_rate = getattr(self, "sample_rate", 16000)
+ if isinstance(audio_url_or_urls, list):
+ return [self.fetch_audio(x, sampling_rate=sampling_rate) for x in audio_url_or_urls]
+ elif isinstance(audio_url_or_urls, str):
+ return load_audio(audio_url_or_urls, sampling_rate=sampling_rate)
+ elif is_valid_audio(audio_url_or_urls):
+ return audio_url_or_urls
+ else:
+ raise TypeError(f"only a single or a list of entries is supported but got type={type(audio_url_or_urls)}")
+
+
+AudioProcessingMixin.push_to_hub = copy_func(AudioProcessingMixin.push_to_hub)
+if AudioProcessingMixin.push_to_hub.__doc__ is not None:
+ AudioProcessingMixin.push_to_hub.__doc__ = AudioProcessingMixin.push_to_hub.__doc__.format(
+ object="audio processor", object_class="AutoFeatureExtractor", object_files="audio processor file"
+ )
diff --git a/src/transformers/audio_processing_utils.py b/src/transformers/audio_processing_utils.py
new file mode 100644
index 000000000000..33f4eaff143d
--- /dev/null
+++ b/src/transformers/audio_processing_utils.py
@@ -0,0 +1,633 @@
+# Copyright 2025 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import fields, replace
+from typing import TypedDict, Unpack
+
+import numpy as np
+from huggingface_hub.dataclasses import validate_typed_dict
+
+from .audio_processing_base import AudioProcessingMixin, BatchFeature
+from .audio_utils import AudioInput, SpectrogramConfig, make_list_of_audio
+from .tokenization_utils_base import TruncationStrategy
+from .utils import PaddingStrategy, TensorType, logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class AudioKwargs(TypedDict, total=False):
+ sampling_rate: int | None
+ spectrogram_config: dict | SpectrogramConfig | None
+ do_extract_spectrogram: bool | None
+ do_resample: bool | None
+ return_tensors: str | TensorType | None
+ padding: bool | str | PaddingStrategy | None
+ max_length: int | None
+ truncation: bool | str | TruncationStrategy | None
+ pad_to_multiple_of: int | None
+
+
+class BaseAudioProcessor(AudioProcessingMixin):
+ model_input_names = ["audio"]
+ valid_kwargs = AudioKwargs
+ unused_kwargs = None
+
+ # global defaults
+    sample_rate: int | None = None
+    force_mono: bool | None = None
+ add_channel_dim: bool = False
+
+ # padding defaults
+ padding = True
+ padding_side = "right"
+ padding_value = 0.0
+ max_length = None
+ truncation = None
+ pad_to_multiple_of = None
+
+ return_padding_mask = True
+ mask_level = None # None = auto (features for spectrogram, audio for raw), "audio" = always audio-level
+ spectrogram_config = None
+ do_extract_spectrogram = None
+
+ def __init__(
+ self,
+ sample_rate: int | None = None,
+ force_mono: bool | None = None,
+ **kwargs,
+ ):
+ if sample_rate is not None:
+ self.sample_rate = sample_rate
+ if self.sample_rate is None:
+ raise ValueError(
+ f"`sample_rate` must be set either as a class attribute on {self.__class__.__name__} "
+ "or passed to __init__."
+ )
+
+ if force_mono is not None:
+ self.force_mono = force_mono
+ if self.force_mono is None:
+ raise ValueError(
+ f"`force_mono` must be set either as a class attribute on {self.__class__.__name__} "
+ "or passed to __init__."
+ )
+
+ super().__init__(**kwargs)
+
+ # Standardize init attributes (coerce dicts to config dataclasses)
+ attributes = {key: getattr(self, key) for key in self._valid_kwargs_names}
+ attributes = self._standardize_kwargs(**attributes)
+ for key, value in attributes.items():
+ setattr(self, key, value)
+
+ # Pre-compute mel filters from spectrogram_config
+ if self.spectrogram_config is not None:
+ if self.spectrogram_config.mel_scale_config is not None and not hasattr(self, "mel_filters"):
+ self.mel_filters = self._mel_filter_bank(self.spectrogram_config)
+ self._cached_stft_window = None
+
+ def __call__(self, audio: AudioInput, *args, **kwargs: Unpack[AudioKwargs]) -> BatchFeature:
+ return self.preprocess(audio, *args, **kwargs)
+
+ def preprocess(self, audio: AudioInput, *args, **kwargs: Unpack[AudioKwargs]) -> BatchFeature:
+ """
+ Preprocess an audio or a batch of audio.
+ """
+ # Perform type validation on received kwargs
+ validate_typed_dict(self.valid_kwargs, kwargs)
+
+ # Set default kwargs from self.
+ for kwarg_name in self._valid_kwargs_names:
+ kwargs.setdefault(kwarg_name, getattr(self, kwarg_name, None))
+
+ # Standardize kwargs (coerce dicts to config dataclasses)
+ kwargs = self._standardize_kwargs(**kwargs)
+
+ # Validate kwargs
+ self._validate_preprocess_kwargs(**kwargs)
+
+ return self._preprocess_audio_like_inputs(audio, *args, **kwargs)
+
+    def _preprocess_audio_like_inputs(
+        self,
+        audio: AudioInput,
+        *args,
+        sampling_rate: int | None = None,
+        **kwargs: Unpack[AudioKwargs],
+    ) -> BatchFeature:
+        audio = self._prepare_audio_like_inputs(audio=audio, sample_rate=sampling_rate)
+ return self._preprocess(audio, *args, **kwargs)
+
+ def _to_batch(self, audio):
+ """Stack a list of audio arrays/tensors into a batch. Implemented by backend subclasses."""
+ raise NotImplementedError
+
+ def _get_mask(self, audio_ranges, padded_length, do_extract_spectrogram, spectrogram_config):
+ """Build attention mask dict from audio_ranges. Returns a dict of {key: mask} to merge into output.
+ Implemented by backend subclasses."""
+ raise NotImplementedError
+
+ def _preprocess(
+ self,
+ audio,
+ padding,
+ max_length,
+ truncation,
+ pad_to_multiple_of,
+ return_tensors,
+ spectrogram_config=None,
+ do_extract_spectrogram=None,
+ do_batch_spectrogram=None,
+ **kwargs,
+ ) -> BatchFeature:
+ if do_batch_spectrogram is None:
+ do_batch_spectrogram = getattr(self, "do_batch_spectrogram", True)
+ if do_extract_spectrogram and not do_batch_spectrogram:
+ # Per-waveform extraction path: extract → postprocess → pad features → mask
+ features = self.extract_spectrogram(audio, spectrogram_config=spectrogram_config, **kwargs)
+ feature_lengths = [f.shape[0] for f in features]
+ features = self._postprocess_features(features, feature_lengths)
+ features, feature_ranges = self._pad_features(
+ features, padding, max_length, truncation, pad_to_multiple_of
+ )
+ output = {"audio_features": self._stack_features(features)}
+ if self.return_padding_mask:
+ padded_length = features[0].shape[0]
+ output.update(self._get_feature_mask(feature_ranges, padded_length))
+ output = self._postprocess_output(output, feature_ranges=feature_ranges, **kwargs)
+ else:
+ # Standard path: pad audio → optionally batch → extract/passthrough
+ audio, audio_ranges = self.pad(audio, padding, max_length, truncation, pad_to_multiple_of)
+ padded_length = audio[0].shape[-1]
+
+ if do_extract_spectrogram:
+ audio = self._to_batch(audio) if do_batch_spectrogram else audio
+ feature = self.extract_spectrogram(audio, spectrogram_config=spectrogram_config, audio_ranges=audio_ranges, **kwargs)
+ output = {"audio_features": feature}
+ else:
+ output = {"audio_values": self._to_batch(audio)}
+
+ if self.return_padding_mask:
+ output.update(self._get_mask(
+ audio_ranges, padded_length, do_extract_spectrogram=do_extract_spectrogram, spectrogram_config=spectrogram_config
+ ))
+ output = self._postprocess_output(output, audio_ranges=audio_ranges, **kwargs)
+
+ return BatchFeature(data=output, tensor_type=return_tensors)
+
+ def _postprocess_features(self, features, feature_lengths):
+ """Hook: per-utterance feature processing after extraction, before feature-level padding.
+
+ Override for normalization that must happen on unpadded features
+ (e.g., SeamlessM4t mean/variance normalization).
+ """
+ return features
+
+ def _postprocess_output(self, output, audio_ranges=None, feature_ranges=None, **kwargs):
+ """Hook: augment or modify the output dict after main processing.
+
+ Override to add custom fields (e.g., audio_embed_sizes) or
+ post-hoc normalization on the stacked/batched output.
+ """
+ return output
+
+ def _pad_features(self, features, padding, max_length, truncation, pad_to_multiple_of):
+ """Pad a list of 2D feature arrays along the time axis (axis 0).
+ Implemented by backend subclasses."""
+ raise NotImplementedError
+
+ def _stack_features(self, features):
+ """Stack a list of feature arrays/tensors into a batch.
+ Implemented by backend subclasses."""
+ raise NotImplementedError
+
+ def _get_feature_mask(self, feature_ranges, padded_length):
+ """Build attention mask dict from feature_ranges.
+ Implemented by backend subclasses."""
+ raise NotImplementedError
+
+ def _prepare_audio_like_inputs(self, audio: AudioInput, *args, sample_rate: int | None = None, **kwargs) -> list:
+ """
+ Prepare audio-like inputs for processing by structuring and then converting each
+ audio item via `process_audio`.
+
+ Analogous to `_prepare_image_like_inputs` in the image processing pipeline.
+ """
+ audio = self._prepare_audio_structure(audio, sample_rate=sample_rate)
+ audio = [self.process_audio(audio_el) for audio_el in audio]
+ return audio
+
+ def _prepare_audio_structure(self, audio: AudioInput, sample_rate: int | None = None) -> list:
+ """
+ Prepare the audio structure for processing: fetch URL inputs, validate sample rate,
+ and flatten into a list of audio arrays.
+
+ Analogous to `_prepare_images_structure` in the image processing pipeline.
+ """
+ is_url_input = isinstance(audio, str) or (
+ isinstance(audio, (list, tuple)) and all(isinstance(el, str) for el in audio)
+ )
+
+ if is_url_input:
+ # URL inputs: load directly at the correct sample rate
+ audio = self.fetch_audio(audio)
+ else:
+ # Array inputs: validate that the user-provided sample rate matches the model's
+ if sample_rate is not None:
+ if sample_rate != self.sample_rate:
+ raise ValueError(
+ f"The model corresponding to this audio processor: {self.__class__.__name__} was trained using a"
+ f" sample rate of {self.sample_rate}. Please make sure that the provided `audio` input"
+ f" was sampled with {self.sample_rate} and not {sample_rate}."
+ )
+ else:
+ logger.warning(
+ f"It is strongly recommended to pass the `sample_rate` argument to `{self.__class__.__name__}()`. "
+ "Failing to do so can result in silent errors that might be hard to debug."
+ )
+
+ audio = make_list_of_audio(audio)
+ return audio
+
+ def _process_audio(self, *args, **kwargs):
+ """
+ Process a single raw audio input into the backend's working format.
+
+ Implemented by backend subclasses (e.g., `TorchAudioBackend`). Converts a raw input
+ (NumPy array) to the backend's internal format (e.g., `torch.Tensor`), handles
+ mono conversion if needed.
+ """
+ raise NotImplementedError
+
+ def process_audio(self, *args, **kwargs):
+ return self._process_audio(*args, **kwargs)
+
+ def pad(
+ self,
+        audio: AudioInput,  # TODO: this type hint makes it unclear that an iterable is expected here
+ padding: bool | str | PaddingStrategy = True,
+ max_length: int | None = None,
+ truncation: bool = False,
+ pad_to_multiple_of: int | None = None,
+ ) -> tuple[list, list[tuple[int, int]]]:
+ padding_strategy = self._get_padding_strategies(padding=padding, max_length=max_length)
+
+ if truncation:
+ if max_length is None:
+ # TODO: maybe this check should happen in the _validate_preprocess_kwargs method
+ raise ValueError("When setting `truncation=True`, make sure that `max_length` is defined.")
+ trunc_length = max_length
+ if pad_to_multiple_of is not None and (trunc_length % pad_to_multiple_of != 0):
+ trunc_length = ((trunc_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
+ audio = [self._truncate_single(audio_el, max_length=trunc_length) for audio_el in audio]
+
+ if padding_strategy == PaddingStrategy.LONGEST:
+ max_length = max(audio_el.shape[-1] for audio_el in audio)
+ padding_strategy = PaddingStrategy.MAX_LENGTH
+
+ if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
+ max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
+
+ actual_lengths = [audio_el.shape[-1] for audio_el in audio]
+
+ if padding_strategy != PaddingStrategy.DO_NOT_PAD:
+ audio = [self._pad_single(audio_el, max_length=max_length) for audio_el in audio]
+
+ audio_ranges = []
+ for i, length in enumerate(actual_lengths):
+ padded_length = audio[i].shape[-1]
+ if self.padding_side == "left":
+ audio_ranges.append((padded_length - length, padded_length))
+ else:
+ audio_ranges.append((0, length))
+
+ return audio, audio_ranges
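+
+    # Illustrative behaviour (a sketch): padding two clips of lengths 100 and
+    # 160 with padding=True (longest) right-pads the first to 160 and returns
+    # audio_ranges [(0, 100), (0, 160)] marking the valid samples.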
+
+ def _truncate_single(self, audio_el, max_length: int):
+ """Truncate a single audio element to max_length along the time axis."""
+ if audio_el.shape[-1] > max_length:
+ return audio_el[..., :max_length]
+ return audio_el
+
+ def _pad_single(self, audio, max_length: int) -> AudioInput:
+ """
+ Pad a single input (on left/right) up to predefined length or max length in the batch.
+
+ Implemented by backend subclasses.
+ """
+ raise NotImplementedError
+
+ def extract_spectrogram(self, audio, *, spectrogram_config: SpectrogramConfig | None = None, **kwargs):
+ """
+ Extract spectrogram features from audio.
+
+        The numpy and torch backends implement the underlying steps of this method; extraction
+        is batched by default, but can be run per waveform instead.
+        This can extract a plain spectrogram, or a mel spectrogram if a mel config is provided.
+
+ Any extra kwargs whose names match ``SpectrogramConfig`` fields will
+ override the corresponding value on the config for this call.
+
+ Note: Models that bypass the base STFT pipeline entirely (e.g., GraniteSpeech
+ using torchaudio.transforms.MelSpectrogram, or MusicgenMelody using chroma
+ features) can set ``do_extract_spectrogram=True`` without providing a
+ ``spectrogram_config``. They must override this method completely.
+ """
+ if spectrogram_config is None:
+ spectrogram_config = self.spectrogram_config
+
+ config_field_names = {f.name for f in fields(SpectrogramConfig)}
+ overrides = {k: kwargs.pop(k) for k in list(kwargs) if k in config_field_names}
+ if overrides:
+ spectrogram_config = replace(spectrogram_config, **overrides)
+
+ if isinstance(audio, list):
+ features = [
+ self._extract_spectrogram(a, spectrogram_config=spectrogram_config, **kwargs)
+ for a in audio
+ ]
+ if spectrogram_config.mel_scale_config is not None:
+ features = [
+ self._apply_mel_scale(f, spectrogram_config=spectrogram_config, **kwargs)
+ for f in features
+ ]
+ features = [
+ self._normalize_magnitude(f, spectrogram_config=spectrogram_config, **kwargs)
+ for f in features
+ ]
+ else:
+ features = self._extract_spectrogram(audio, spectrogram_config=spectrogram_config, **kwargs)
+ if spectrogram_config.mel_scale_config is not None:
+ features = self._apply_mel_scale(features, spectrogram_config=spectrogram_config, **kwargs)
+ features = self._normalize_magnitude(features, spectrogram_config=spectrogram_config, **kwargs)
+
+ return features
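+
+    # Illustrative per-call override (a sketch): kwargs matching SpectrogramConfig
+    # fields build a one-off copy of the config, e.g.
+    #
+    #     processor.extract_spectrogram(audio, preemphasis=0.97)
+    #
+    # runs this call with preemphasis=0.97 without mutating self.spectrogram_config.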
+
+ # ── Spectrogram extraction pipeline ──────────────────────────────────
+ #
+ # The full feature-extraction pipeline executed by `extract_spectrogram`:
+ #
+ # 1. _extract_spectrogram (STFT → power/magnitude spectrogram)
+ # a. _stft – orchestrates steps b–g (overridable for fully custom STFTs)
+ # b. _needs_manual_framing – decide framing strategy (hook)
+ # c. _create_stft_window – create the STFT window (backend)
+ # d. _prepare_window_and_framing– pad/reshape window, decide frame length (backend)
+ # e. manual path (needs_manual_framing=True):
+ # _frame_audio – center pad + frame extraction (backend)
+ # _apply_frame_processing – per-frame conditioning (hook)
+ # _window_and_fft – window + zero-pad + FFT + normalize → complex (backend)
+ # native path (needs_manual_framing=False):
+ # _native_stft – native STFT returning complex output (backend)
+ # f. _compute_magnitudes – complex → real magnitudes (backend, shared by both paths)
+ # g. _cast_stft_output – cast output dtype (hook, no-op by default)
+ # 2. _apply_mel_scale (mel filterbank projection)
+ # 3. _normalize_magnitude (log / dB scaling, optional per-utterance norm)
+ #
+ # Backend subclasses (NumpyAudioBackend, TorchAudioBackend) implement the
+ # full pipeline. Model-specific processors can override individual hooks
+ # (_apply_frame_processing) or the entire _stft when the base STFT path
+ # is insufficient.
+ #
+ # ``audio_ranges`` is passed through as a kwarg from ``_preprocess`` so that
+ # model-specific overrides (e.g., Parakeet waveform-level preemphasis,
+ # Phi4 boundary masking) can access original audio lengths without stashing
+ # state on ``self``.
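+    #
+    # Example hook override (a sketch; `MyProcessor` is hypothetical):
+    #
+    #     class MyProcessor(TorchAudioBackend):
+    #         def _apply_frame_processing(self, frames, *, spectrogram_config, **kwargs):
+    #             frames = super()._apply_frame_processing(frames, spectrogram_config=spectrogram_config, **kwargs)
+    #             return frames * 2.0  # extra per-frame scaling after the standard steps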
+
+ def _extract_spectrogram(self, audio, *, spectrogram_config, **kwargs):
+ """Orchestrate the STFT pipeline.
+
+ Runs the sub-steps listed above in order. Override this only when the
+ pipeline ordering itself needs to change. Otherwise, override individual hooks.
+ """
+ return self._stft(audio, spectrogram_config=spectrogram_config, **kwargs)
+
+ def _stft(self, audio, *, spectrogram_config, **kwargs):
+ """Compute the STFT and return a power/magnitude spectrogram.
+
+ Orchestrates the sub-steps listed in the pipeline documentation above.
+ Backend subclasses implement the individual leaf methods; model-specific
+ processors can override this entirely for a fully custom STFT
+ (e.g., Gemma3n's unfold-based STFT with extra-sample framing).
+ """
+ stft_cfg = spectrogram_config.stft_config
+ n_fft = stft_cfg.n_fft
+ win_length = stft_cfg.win_length or n_fft
+ hop_length = stft_cfg.hop_length or win_length // 2
+ needs_manual_framing = self._needs_manual_framing(spectrogram_config)
+
+ if spectrogram_config.computation_dtype:
+ dtype_str = spectrogram_config.computation_dtype
+ if isinstance(audio, np.ndarray):
+ audio = audio.astype(dtype_str)
+ else:
+ import torch
+ audio = audio.to(getattr(torch, dtype_str))
+ if spectrogram_config.waveform_scale is not None:
+ audio = audio * spectrogram_config.waveform_scale
+
+ # Cache window on first call; reuse on subsequent calls with same config
+ if self._cached_stft_window is not None and spectrogram_config is self.spectrogram_config:
+ window, frame_length = self._cached_stft_window
+ else:
+ window = self._create_stft_window(win_length, stft_cfg, audio)
+ window, frame_length = self._prepare_window_and_framing(window, win_length, n_fft, needs_manual_framing)
+ if spectrogram_config is self.spectrogram_config:
+ self._cached_stft_window = (window, frame_length)
+
+ if needs_manual_framing:
+ audio_dtype = audio.dtype
+ frames = self._frame_audio(audio, window, frame_length, hop_length, n_fft, stft_cfg)
+ frames = self._apply_frame_processing(frames, spectrogram_config=spectrogram_config, **kwargs)
+ stft_out = self._window_and_fft(frames, window, frame_length, n_fft, stft_cfg, audio_dtype=audio_dtype)
+ else:
+ stft_out = self._native_stft(audio, window, frame_length, hop_length, n_fft, stft_cfg)
+
+ magnitudes = self._compute_magnitudes(stft_out, stft_cfg.power, spectrogram_config=spectrogram_config)
+ return self._cast_stft_output(magnitudes, spectrogram_config)
+
+ def _create_stft_window(self, win_length, stft_cfg, audio):
+ """Create the STFT window. Implemented by backend subclasses."""
+ raise NotImplementedError
+
+ def _prepare_window_and_framing(self, window, win_length, n_fft, needs_manual_framing):
+ """Pad/reshape window and determine frame length. Implemented by backend subclasses."""
+ raise NotImplementedError
+
+ def _frame_audio(self, audio, window, frame_length, hop_length, n_fft, stft_cfg):
+ """Extract overlapping frames from the audio signal.
+
+ Handles center padding and dtype promotion. Returns frames of shape
+ (..., num_frames, frame_length). Implemented by backend subclasses.
+ """
+ raise NotImplementedError
+
+    def _window_and_fft(self, frames, window, frame_length, n_fft, stft_cfg, audio_dtype=None):
+ """Apply window, zero-pad, FFT, and normalize. Returns complex STFT of shape (..., freq, time).
+ Implemented by backend subclasses."""
+ raise NotImplementedError
+
+ def _native_stft(self, audio, window, frame_length, hop_length, n_fft, stft_cfg):
+ """Native STFT (e.g. torch.stft). Returns complex output. Implemented by backend subclasses."""
+ raise NotImplementedError
+
+ def _compute_magnitudes(self, stft_out, power, spectrogram_config=None):
+ """Convert complex STFT output to a real-valued magnitude spectrogram.
+ Implemented by backend subclasses. Overridable for custom magnitude computation (e.g. Parakeet)."""
+ raise NotImplementedError
+
+ def _cast_stft_output(self, magnitudes, spectrogram_config):
+ """Cast STFT output to the desired output dtype. Default: no-op."""
+ return magnitudes
+
+ def _needs_manual_framing(self, spectrogram_config):
+ """Whether the STFT requires manual framing (unfold-based) instead of a native STFT.
+
+ Manual framing is needed when per-frame processing must happen between
+ frame extraction and windowing (e.g. per-frame preemphasis, DC offset removal,
+ or left-aligned FFT padding).
+
+ Override in model-specific processors that handle preemphasis at the
+ waveform level (in ``_stft``) and don't need per-frame processing.
+ """
+ return (
+ (spectrogram_config.preemphasis is not None)
+ or spectrogram_config.remove_dc_offset
+ )
+
+ def _apply_frame_processing(self, frames, *, spectrogram_config, **kwargs):
+ """Hook: per-frame signal conditioning after frame extraction.
+
+ Called after framing, before windowing and FFT. Default backend
+ implementations apply dither, DC-offset removal, and standard
+ preemphasis.
+
+ Override for non-standard frame processing, e.g. HTK-style
+ preemphasis (Gemma3n).
+ """
+ raise NotImplementedError
+
+ def _apply_mel_scale(self, *args, **kwargs):
+ """Apply mel filterbank to spectrogram features."""
+ raise NotImplementedError
+
+ def _normalize_magnitude(self, *args, **kwargs):
+ """Apply magnitude normalization (log, log10, or dB scaling) to spectrogram features."""
+ raise NotImplementedError
+
+ def _mel_filter_bank(self, spectrogram_config: SpectrogramConfig):
+ raise NotImplementedError
+
+ def _get_features_lengths(self, audio_lengths, spectrogram_config, include_center_frame=False):
+ """
+ Convert raw audio sample lengths to the number of feature frames after spectrogram extraction.
+
+ By default returns `audio_lengths // hop_length`, which gives the number of valid (non-padding)
+ feature frames for centered STFT. When `include_center_frame=True` and the STFT uses centering,
+ adds 1 to account for the extra frame produced by centered STFT.
+
+ Override this method in subclasses that use non-standard STFT configurations (e.g., unfold-based
+ or non-centered STFT).
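+
+        Example (illustrative): 1 second of 16 kHz audio (16000 samples) with
+        `hop_length=160` yields `16000 // 160 = 100` frames, or 101 when
+        `include_center_frame=True` and the STFT is centered.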
+ """
+        # mirror the win/hop fallbacks used in `_stft` (`hop_length` may be None in StftConfig)
+        stft_cfg = spectrogram_config.stft_config
+        win_length = stft_cfg.win_length or stft_cfg.n_fft
+        hop_length = stft_cfg.hop_length or win_length // 2
+        lengths = audio_lengths // hop_length
+ if include_center_frame and spectrogram_config.stft_config.center:
+ lengths = lengths + 1
+ return lengths
+
+ def _get_padding_strategies(self, padding=False, max_length=None):
+ """Find the correct padding strategy."""
+ if padding is not False:
+ if padding is True:
+ padding_strategy = PaddingStrategy.LONGEST
+ elif not isinstance(padding, PaddingStrategy):
+ padding_strategy = PaddingStrategy(padding)
+            else:
+ padding_strategy = padding
+ else:
+ padding_strategy = PaddingStrategy.DO_NOT_PAD
+
+ if max_length is None:
+ if padding_strategy == PaddingStrategy.MAX_LENGTH:
+ raise ValueError(
+ f"When setting ``padding={PaddingStrategy.MAX_LENGTH}``, make sure that max_length is defined"
+ )
+
+ if padding_strategy != PaddingStrategy.DO_NOT_PAD and (self.padding_value is None):
+ raise ValueError(
+ "Asking to pad but the feature_extractor does not have a padding value. Please select a value to use"
+ " as `padding_value`. For example: `feature_extractor.padding_value = 0.0`."
+ )
+
+ return padding_strategy
+
+    def _standardize_kwargs(self, **kwargs) -> dict:
+        """Coerce dict configs to their dataclass form."""
+        if isinstance(kwargs.get("spectrogram_config"), dict):
+            kwargs["spectrogram_config"] = SpectrogramConfig.from_dict(kwargs["spectrogram_config"])
+ if kwargs.get("spectrogram_config") is not None and kwargs.get("do_extract_spectrogram") is None:
+ kwargs["do_extract_spectrogram"] = True
+ return kwargs
+
+ def _validate_preprocess_kwargs(
+ self,
+ sample_rate: int | None = None,
+ max_length: int | None = None,
+ truncation: bool | None = None,
+ pad_to_multiple_of: int | None = None,
+        return_tensors: "str | TensorType | None" = None,  # stringified: `TensorType` is not imported at runtime here
+ **kwargs,
+ ):
+ """Validate the kwargs for the preprocess method."""
+ if truncation and max_length is None:
+            raise ValueError("When setting `truncation=True`, make sure that `max_length` is defined.")
+
+ def to_dict(self):
+ output = super().to_dict()
+ # Serialize config dataclasses to plain dicts for JSON persistence
+ for key in ("spectrogram_config",):
+ if key in output and hasattr(output[key], "to_dict"):
+ output[key] = output[key].to_dict()
+
+ # Filter out None values that are class defaults
+ filtered_dict = {}
+ for key, value in output.items():
+ if value is None:
+ class_default = getattr(type(self), key, "NOT_FOUND")
+ # Keep None if user explicitly set it (class default is non-None)
+ if class_default != "NOT_FOUND" and class_default is not None:
+ filtered_dict[key] = value
+ else:
+ filtered_dict[key] = value
+
+ return filtered_dict
diff --git a/src/transformers/audio_utils.py b/src/transformers/audio_utils.py
index 85b56634afe7..14e70e12e45c 100644
--- a/src/transformers/audio_utils.py
+++ b/src/transformers/audio_utils.py
@@ -22,6 +22,7 @@
import os
import warnings
from collections.abc import Sequence
+from dataclasses import dataclass, field, fields
from io import BytesIO
from typing import TYPE_CHECKING, Any, Union
@@ -57,6 +58,124 @@
AudioInput = Union[np.ndarray, "torch.Tensor", Sequence[np.ndarray], Sequence["torch.Tensor"]]
+
+
+@dataclass(frozen=True)
+class StftConfig:
+ """Configuration for Short-Time Fourier Transform.
+
+ Uses torchaudio parameter naming conventions. See
+ `torchaudio.transforms.MelSpectrogram` for reference.
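+
+    Example (illustrative round trip):
+
+    ```python
+    cfg = StftConfig(n_fft=512)
+    d = cfg.to_dict()  # None-valued fields (win_length, hop_length, ...) are dropped
+    assert StftConfig.from_dict(d) == cfg  # unknown keys are ignored, defaults are restored
+    ```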
+ """
+
+ n_fft: int = 400
+ win_length: int | None = None
+ hop_length: int | None = None
+ window_fn: str = "hann_window"
+ wkwargs: dict | None = None
+ power: float = 2.0
+ center: bool = True
+ pad_mode: str = "reflect"
+ normalized: bool = False
+ onesided: bool | None = None
+ periodic: bool = True
+ left_align_fft: bool = False
+ window_dtype: str | None = None
+
+ def to_dict(self) -> dict:
+ return {f.name: getattr(self, f.name) for f in fields(self) if getattr(self, f.name) is not None}
+
+ @classmethod
+ def from_dict(cls, d: dict) -> "StftConfig":
+ valid_keys = {f.name for f in fields(cls)}
+ return cls(**{k: v for k, v in d.items() if k in valid_keys})
+
+
+@dataclass(frozen=True)
+class MelScaleConfig:
+ """Configuration for mel filterbank.
+
+ Uses torchaudio parameter naming conventions. See
+ `torchaudio.transforms.MelSpectrogram` for reference.
+ """
+
+ n_mels: int = 128
+ f_min: float = 0.0
+ f_max: float | None = None
+ mel_scale: str = "htk"
+ norm: str | None = None
+ triangularize_in_mel_space: bool = False
+ frequency_bin_mode: str = "rfft"
+ computation_dtype: str | None = None
+ bands_to_zero: int = 0
+ matmul_order: str = "filters_first"
+
+ def to_dict(self) -> dict:
+ return {f.name: getattr(self, f.name) for f in fields(self) if getattr(self, f.name) is not None}
+
+ @classmethod
+ def from_dict(cls, d: dict) -> "MelScaleConfig":
+ valid_keys = {f.name for f in fields(cls)}
+ return cls(**{k: v for k, v in d.items() if k in valid_keys})
+
+
+@dataclass(frozen=True)
+class SpectrogramConfig:
+ """Configuration for spectrogram extraction, composed of STFT and mel scale sub-configs."""
+
+ stft_config: StftConfig = field(default_factory=StftConfig)
+ mel_scale_config: MelScaleConfig | None = None
+ log_mode: str = "log10"
+ chunk_length: int | None = None
+ preemphasis: float | None = None
+ remove_dc_offset: bool = False
+ mel_floor: float = 1e-10
+ waveform_scale: float | None = None
+ computation_dtype: str | None = None
+ skip_last_frame: bool = False
+
+ def __getitem__(self, key):
+ if hasattr(self, key):
+ return getattr(self, key)
+ raise KeyError(f"Key {key} not found in SpectrogramConfig.")
+
+ def __iter__(self):
+ for f in fields(self):
+ val = getattr(self, f.name)
+ if val is not None:
+ if hasattr(val, "to_dict"):
+ yield f.name, val.to_dict()
+ else:
+ yield f.name, val
+
+ def __eq__(self, other):
+ if isinstance(other, dict):
+ return dict(self) == other
+ if isinstance(other, SpectrogramConfig):
+ return tuple(getattr(self, f.name) for f in fields(self)) == tuple(
+ getattr(other, f.name) for f in fields(self)
+ )
+ return NotImplemented
+
+ def to_dict(self) -> dict:
+ return dict(self)
+
+ @classmethod
+ def from_dict(cls, d: dict) -> "SpectrogramConfig":
+ stft_config = StftConfig.from_dict(d["stft_config"]) if "stft_config" in d else StftConfig()
+ mel_scale_config = MelScaleConfig.from_dict(d["mel_scale_config"]) if "mel_scale_config" in d else None
+ return cls(
+ stft_config=stft_config,
+ mel_scale_config=mel_scale_config,
+ log_mode=d.get("log_mode", "log10"),
+ chunk_length=d.get("chunk_length"),
+ preemphasis=d.get("preemphasis"),
+ remove_dc_offset=d.get("remove_dc_offset", False),
+ mel_floor=d.get("mel_floor", 1e-10),
+            waveform_scale=d.get("waveform_scale"),
+            computation_dtype=d.get("computation_dtype"),
+            skip_last_frame=d.get("skip_last_frame", False),
+ )
+
+
def load_audio(audio: str | np.ndarray, sampling_rate=16000, timeout=None) -> np.ndarray:
"""
Loads `audio` to an np.ndarray object.
@@ -282,10 +401,11 @@ def hertz_to_mel(freq: float | np.ndarray, mel_scale: str = "htk") -> float | np
elif mel_scale == "kaldi":
return 1127.0 * np.log(1.0 + (freq / 700.0))
+ f_sp = 200.0 / 3
min_log_hertz = 1000.0
- min_log_mel = 15.0
+ min_log_mel = min_log_hertz / f_sp
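+    # 1000 Hz maps exactly to mel 15.0 (= 1000 / (200 / 3)), the linear-to-log knee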
logstep = 27.0 / np.log(6.4)
- mels = 3.0 * freq / 200.0
+ mels = freq / f_sp
if isinstance(freq, np.ndarray):
log_region = freq >= min_log_hertz
@@ -318,10 +438,11 @@ def mel_to_hertz(mels: float | np.ndarray, mel_scale: str = "htk") -> float | np
elif mel_scale == "kaldi":
return 700.0 * (np.exp(mels / 1127.0) - 1.0)
+ f_sp = 200.0 / 3
min_log_hertz = 1000.0
- min_log_mel = 15.0
+ min_log_mel = min_log_hertz / f_sp
logstep = np.log(6.4) / 27.0
- freq = 200.0 * mels / 3.0
+ freq = f_sp * mels
if isinstance(mels, np.ndarray):
log_region = mels >= min_log_mel
@@ -459,6 +580,7 @@ def mel_filter_bank(
norm: str | None = None,
mel_scale: str = "htk",
triangularize_in_mel_space: bool = False,
+ dtype: np.dtype | None = None,
) -> np.ndarray:
"""
Creates a frequency bin conversion matrix used to obtain a mel spectrogram. This is called a *mel filter bank*, and
@@ -527,7 +649,20 @@ def mel_filter_bank(
# frequencies of FFT bins in Hz
fft_freqs = np.linspace(0, sampling_rate // 2, num_frequency_bins)
- mel_filters = _create_triangular_filter_bank(fft_freqs, filter_freqs)
+ if dtype is not None:
+ # Per-band computation matching librosa's precision path: compute slopes in float64,
+ # cast each band to dtype immediately. This replicates librosa's per-row assignment
+ # to a dtype-initialized array, which produces different rounding than computing all
+ # bands in float64 and casting at the end.
+ filter_diff = np.diff(filter_freqs)
+ ramps = np.subtract.outer(filter_freqs, fft_freqs) # (num_mel_filters+2, num_frequency_bins)
+ mel_filters = np.zeros((num_frequency_bins, num_mel_filters), dtype=dtype)
+ for i in range(num_mel_filters):
+ lower = -ramps[i] / filter_diff[i]
+ upper = ramps[i + 2] / filter_diff[i + 1]
+ mel_filters[:, i] = np.maximum(0, np.minimum(lower, upper)).astype(dtype)
+ else:
+ mel_filters = _create_triangular_filter_bank(fft_freqs, filter_freqs)
if norm is not None and norm == "slaney":
# Slaney-style mel is scaled to be approx constant energy per channel
@@ -620,428 +755,6 @@ def window_function(
return padded_window
-# Note: This method processes a single waveform. For batch processing, use spectrogram_batch().
-def spectrogram(
- waveform: np.ndarray,
- window: np.ndarray,
- frame_length: int,
- hop_length: int,
- fft_length: int | None = None,
- power: float | None = 1.0,
- center: bool = True,
- pad_mode: str = "reflect",
- onesided: bool = True,
- dither: float = 0.0,
- preemphasis: float | None = None,
- mel_filters: np.ndarray | None = None,
- mel_floor: float = 1e-10,
- log_mel: str | None = None,
- reference: float = 1.0,
- min_value: float = 1e-10,
- db_range: float | None = None,
- remove_dc_offset: bool = False,
- dtype: np.dtype = np.float32,
-) -> np.ndarray:
- """
- Calculates a spectrogram over one waveform using the Short-Time Fourier Transform.
-
- This function can create the following kinds of spectrograms:
-
- - amplitude spectrogram (`power = 1.0`)
- - power spectrogram (`power = 2.0`)
- - complex-valued spectrogram (`power = None`)
- - log spectrogram (use `log_mel` argument)
- - mel spectrogram (provide `mel_filters`)
- - log-mel spectrogram (provide `mel_filters` and `log_mel`)
-
- How this works:
-
- 1. The input waveform is split into frames of size `frame_length` that are partially overlapping by `frame_length
- - hop_length` samples.
- 2. Each frame is multiplied by the window and placed into a buffer of size `fft_length`.
- 3. The DFT is taken of each windowed frame.
- 4. The results are stacked into a spectrogram.
-
- We make a distinction between the following "blocks" of sample data, each of which may have a different lengths:
-
- - The analysis frame. This is the size of the time slices that the input waveform is split into.
- - The window. Each analysis frame is multiplied by the window to avoid spectral leakage.
- - The FFT input buffer. The length of this determines how many frequency bins are in the spectrogram.
-
- In this implementation, the window is assumed to be zero-padded to have the same size as the analysis frame. A
- padded window can be obtained from `window_function()`. The FFT input buffer may be larger than the analysis frame,
- typically the next power of two.
-
- Note: This function is not optimized for speed yet. It should be mostly compatible with `librosa.stft` and
- `torchaudio.functional.transforms.Spectrogram`, although it is more flexible due to the different ways spectrograms
- can be constructed.
-
- Args:
- waveform (`np.ndarray` of shape `(length,)`):
- The input waveform. This must be a single real-valued, mono waveform.
- window (`np.ndarray` of shape `(frame_length,)`):
- The windowing function to apply, including zero-padding if necessary. The actual window length may be
- shorter than `frame_length`, but we're assuming the array has already been zero-padded.
- frame_length (`int`):
- The length of the analysis frames in samples. With librosa this is always equal to `fft_length` but we also
- allow smaller sizes.
- hop_length (`int`):
- The stride between successive analysis frames in samples.
- fft_length (`int`, *optional*):
- The size of the FFT buffer in samples. This determines how many frequency bins the spectrogram will have.
- For optimal speed, this should be a power of two. If `None`, uses `frame_length`.
- power (`float`, *optional*, defaults to 1.0):
- If 1.0, returns the amplitude spectrogram. If 2.0, returns the power spectrogram. If `None`, returns
- complex numbers.
- center (`bool`, *optional*, defaults to `True`):
- Whether to pad the waveform so that frame `t` is centered around time `t * hop_length`. If `False`, frame
- `t` will start at time `t * hop_length`.
- pad_mode (`str`, *optional*, defaults to `"reflect"`):
- Padding mode used when `center` is `True`. Possible values are: `"constant"` (pad with zeros), `"edge"`
- (pad with edge values), `"reflect"` (pads with mirrored values).
- onesided (`bool`, *optional*, defaults to `True`):
- If True, only computes the positive frequencies and returns a spectrogram containing `fft_length // 2 + 1`
- frequency bins. If False, also computes the negative frequencies and returns `fft_length` frequency bins.
- dither (`float`, *optional*, defaults to 0.0):
- Adds dithering. In other words, adds a small Gaussian noise to each frame.
- E.g. use 4.0 to add dithering with a normal distribution centered
- around 0.0 with standard deviation 4.0, 0.0 means no dithering.
- Dithering has similar effect as `mel_floor`. It reduces the high log_mel_fbank
- values for signals with hard-zero sections, when VAD cutoff is present in the signal.
- preemphasis (`float`, *optional*)
- Coefficient for a low-pass filter that applies pre-emphasis before the DFT.
- mel_filters (`np.ndarray` of shape `(num_freq_bins, num_mel_filters)`, *optional*):
- The mel filter bank. If supplied, applies a this filter bank to create a mel spectrogram.
- mel_floor (`float`, *optional*, defaults to 1e-10):
- Minimum value of mel frequency banks.
- log_mel (`str`, *optional*):
- How to convert the spectrogram to log scale. Possible options are: `None` (don't convert), `"log"` (take
- the natural logarithm) `"log10"` (take the base-10 logarithm), `"dB"` (convert to decibels). Can only be
- used when `power` is not `None`.
- reference (`float`, *optional*, defaults to 1.0):
- Sets the input spectrogram value that corresponds to 0 dB. For example, use `np.max(spectrogram)` to set
- the loudest part to 0 dB. Must be greater than zero.
- min_value (`float`, *optional*, defaults to `1e-10`):
- The spectrogram will be clipped to this minimum value before conversion to decibels, to avoid taking
- `log(0)`. For a power spectrogram, the default of `1e-10` corresponds to a minimum of -100 dB. For an
- amplitude spectrogram, the value `1e-5` corresponds to -100 dB. Must be greater than zero.
- db_range (`float`, *optional*):
- Sets the maximum dynamic range in decibels. For example, if `db_range = 80`, the difference between the
- peak value and the smallest value will never be more than 80 dB. Must be greater than zero.
- remove_dc_offset (`bool`, *optional*):
- Subtract mean from waveform on each frame, applied before pre-emphasis. This should be set to `true` in
- order to get the same results as `torchaudio.compliance.kaldi.fbank` when computing mel filters.
- dtype (`np.dtype`, *optional*, defaults to `np.float32`):
- Data type of the spectrogram tensor. If `power` is None, this argument is ignored and the dtype will be
- `np.complex64`.
-
- Returns:
- `nd.array` containing a spectrogram of shape `(num_frequency_bins, length)` for a regular spectrogram or shape
- `(num_mel_filters, length)` for a mel spectrogram.
- """
- window_length = len(window)
-
- if fft_length is None:
- fft_length = frame_length
-
- if frame_length > fft_length:
- raise ValueError(f"frame_length ({frame_length}) may not be larger than fft_length ({fft_length})")
-
- if window_length != frame_length:
- raise ValueError(f"Length of the window ({window_length}) must equal frame_length ({frame_length})")
-
- if hop_length <= 0:
- raise ValueError("hop_length must be greater than zero")
-
- if waveform.ndim != 1:
- raise ValueError(f"Input waveform must have only one dimension, shape is {waveform.shape}")
-
- if np.iscomplexobj(waveform):
- raise ValueError("Complex-valued input waveforms are not currently supported")
-
- if power is None and mel_filters is not None:
- raise ValueError(
- "You have provided `mel_filters` but `power` is `None`. Mel spectrogram computation is not yet supported for complex-valued spectrogram."
- "Specify `power` to fix this issue."
- )
-
- # center pad the waveform
- if center:
- padding = [(int(frame_length // 2), int(frame_length // 2))]
- waveform = np.pad(waveform, padding, mode=pad_mode)
-
- # promote to float64, since np.fft uses float64 internally
- waveform = waveform.astype(np.float64)
- window = window.astype(np.float64)
-
- # split waveform into frames of frame_length size
- num_frames = int(1 + np.floor((waveform.size - frame_length) / hop_length))
-
- num_frequency_bins = (fft_length // 2) + 1 if onesided else fft_length
- spectrogram = np.empty((num_frames, num_frequency_bins), dtype=np.complex64)
-
- # rfft is faster than fft
- fft_func = np.fft.rfft if onesided else np.fft.fft
- buffer = np.zeros(fft_length)
-
- timestep = 0
- for frame_idx in range(num_frames):
- buffer[:frame_length] = waveform[timestep : timestep + frame_length]
-
- if dither != 0.0:
- buffer[:frame_length] += dither * np.random.randn(frame_length)
-
- if remove_dc_offset:
- buffer[:frame_length] = buffer[:frame_length] - buffer[:frame_length].mean()
-
- if preemphasis is not None:
- buffer[1:frame_length] -= preemphasis * buffer[: frame_length - 1]
- buffer[0] *= 1 - preemphasis
-
- buffer[:frame_length] *= window
-
- spectrogram[frame_idx] = fft_func(buffer)
- timestep += hop_length
-
- # note: ** is much faster than np.power
- if power is not None:
- spectrogram = np.abs(spectrogram, dtype=np.float64) ** power
-
- spectrogram = spectrogram.T
-
- if mel_filters is not None:
- spectrogram = np.maximum(mel_floor, np.dot(mel_filters.T, spectrogram))
-
- if power is not None and log_mel is not None:
- if log_mel == "log":
- spectrogram = np.log(spectrogram)
- elif log_mel == "log10":
- spectrogram = np.log10(spectrogram)
- elif log_mel == "dB":
- if power == 1.0:
- spectrogram = amplitude_to_db(spectrogram, reference, min_value, db_range)
- elif power == 2.0:
- spectrogram = power_to_db(spectrogram, reference, min_value, db_range)
- else:
- raise ValueError(f"Cannot use log_mel option '{log_mel}' with power {power}")
- else:
- raise ValueError(f"Unknown log_mel option: {log_mel}")
-
- spectrogram = np.asarray(spectrogram, dtype)
-
- return spectrogram
-
-
-def spectrogram_batch(
- waveform_list: list[np.ndarray],
- window: np.ndarray,
- frame_length: int,
- hop_length: int,
- fft_length: int | None = None,
- power: float | None = 1.0,
- center: bool = True,
- pad_mode: str = "reflect",
- onesided: bool = True,
- dither: float = 0.0,
- preemphasis: float | None = None,
- mel_filters: np.ndarray | None = None,
- mel_floor: float = 1e-10,
- log_mel: str | None = None,
- reference: float = 1.0,
- min_value: float = 1e-10,
- db_range: float | None = None,
- remove_dc_offset: bool = False,
- dtype: np.dtype = np.float32,
-) -> list[np.ndarray]:
- """
- Calculates spectrograms for a list of waveforms using the Short-Time Fourier Transform, optimized for batch processing.
- This function extends the capabilities of the `spectrogram` function to handle multiple waveforms efficiently by leveraging broadcasting.
-
- It supports generating various types of spectrograms:
-
- - amplitude spectrogram (`power = 1.0`)
- - power spectrogram (`power = 2.0`)
- - complex-valued spectrogram (`power = None`)
- - log spectrogram (use `log_mel` argument)
- - mel spectrogram (provide `mel_filters`)
- - log-mel spectrogram (provide `mel_filters` and `log_mel`)
-
- How this works:
-
- 1. The input waveform is split into frames of size `frame_length` that are partially overlapping by `frame_length
- - hop_length` samples.
- 2. Each frame is multiplied by the window and placed into a buffer of size `fft_length`.
- 3. The DFT is taken of each windowed frame.
- 4. The results are stacked into a spectrogram.
-
- We make a distinction between the following "blocks" of sample data, each of which may have a different lengths:
-
- - The analysis frame. This is the size of the time slices that the input waveform is split into.
- - The window. Each analysis frame is multiplied by the window to avoid spectral leakage.
- - The FFT input buffer. The length of this determines how many frequency bins are in the spectrogram.
-
- In this implementation, the window is assumed to be zero-padded to have the same size as the analysis frame. A
- padded window can be obtained from `window_function()`. The FFT input buffer may be larger than the analysis frame,
- typically the next power of two.
-
- Note: This function is designed for efficient batch processing of multiple waveforms but retains compatibility with individual waveform processing methods like `librosa.stft`.
-
- Args:
- waveform_list (`list[np.ndarray]` with arrays of shape `(length,)`):
- The list of input waveforms, each a single-channel (mono) signal.
- window (`np.ndarray` of shape `(frame_length,)`):
- The windowing function to apply, including zero-padding if necessary.
- frame_length (`int`):
- The length of each frame for analysis.
- hop_length (`int`):
- The step size between successive frames.
- fft_length (`int`, *optional*):
- The size of the FFT buffer, defining frequency bin resolution.
- power (`float`, *optional*, defaults to 1.0):
- Determines the type of spectrogram: 1.0 for amplitude, 2.0 for power, None for complex.
- center (`bool`, *optional*, defaults to `True`):
- Whether to center-pad the waveform frames.
- pad_mode (`str`, *optional*, defaults to `"reflect"`):
- The padding strategy when `center` is `True`.
- onesided (`bool`, *optional*, defaults to `True`):
- If True, returns a one-sided spectrogram for real input signals.
- dither (`float`, *optional*, defaults to 0.0):
- Adds dithering. In other words, adds a small Gaussian noise to each frame.
- E.g. use 4.0 to add dithering with a normal distribution centered
- around 0.0 with standard deviation 4.0, 0.0 means no dithering.
- preemphasis (`float`, *optional*):
- Applies a pre-emphasis filter to each frame.
- mel_filters (`np.ndarray`, *optional*):
- Mel filter bank for converting to mel spectrogram.
- mel_floor (`float`, *optional*, defaults to 1e-10):
- Floor value for mel spectrogram to avoid log(0).
- log_mel (`str`, *optional*):
- Specifies log scaling strategy; options are None, "log", "log10", "dB".
- reference (`float`, *optional*, defaults to 1.0):
- Reference value for dB conversion in log_mel.
- min_value (`float`, *optional*, defaults to 1e-10):
- Minimum floor value for log scale conversions.
- db_range (`float`, *optional*):
- Dynamic range for dB scale spectrograms.
- remove_dc_offset (`bool`, *optional*):
- Whether to remove the DC offset from each frame.
- dtype (`np.dtype`, *optional*, defaults to `np.float32`):
- Data type of the output spectrogram.
-
- Returns:
- list[`np.ndarray`]: A list of spectrogram arrays, one for each input waveform.
- """
- window_length = len(window)
-
- if fft_length is None:
- fft_length = frame_length
-
- if frame_length > fft_length:
- raise ValueError(f"frame_length ({frame_length}) may not be larger than fft_length ({fft_length})")
-
- if window_length != frame_length:
- raise ValueError(f"Length of the window ({window_length}) must equal frame_length ({frame_length})")
-
- if hop_length <= 0:
- raise ValueError("hop_length must be greater than zero")
-
- # Check the dimensions of the waveform , and if waveform is complex
- for waveform in waveform_list:
- if waveform.ndim != 1:
- raise ValueError(f"Input waveform must have only one dimension, shape is {waveform.shape}")
- if np.iscomplexobj(waveform):
- raise ValueError("Complex-valued input waveforms are not currently supported")
- # Center pad the waveform
- if center:
- padding = [(int(frame_length // 2), int(frame_length // 2))]
- waveform_list = [
- np.pad(
- waveform,
- padding,
- mode=pad_mode,
- )
- for waveform in waveform_list
- ]
- original_waveform_lengths = [
- len(waveform) for waveform in waveform_list
- ] # these lengths will be used to remove padding later
-
- # Batch pad the waveform
- max_length = max(original_waveform_lengths)
- padded_waveform_batch = np.array(
- [
- np.pad(waveform, (0, max_length - len(waveform)), mode="constant", constant_values=0)
- for waveform in waveform_list
- ],
- dtype=dtype,
- )
-
- # Promote to float64, since np.fft uses float64 internally
- padded_waveform_batch = padded_waveform_batch.astype(np.float64)
- window = window.astype(np.float64)
-
- # Split waveform into frames of frame_length size
- num_frames = int(1 + np.floor((padded_waveform_batch.shape[1] - frame_length) / hop_length))
- # these lengths will be used to remove padding later
- true_num_frames = [int(1 + np.floor((length - frame_length) / hop_length)) for length in original_waveform_lengths]
- num_batches = padded_waveform_batch.shape[0]
-
- num_frequency_bins = (fft_length // 2) + 1 if onesided else fft_length
- spectrogram = np.empty((num_batches, num_frames, num_frequency_bins), dtype=np.complex64)
-
- # rfft is faster than fft
- fft_func = np.fft.rfft if onesided else np.fft.fft
- buffer = np.zeros((num_batches, fft_length))
-
- for frame_idx in range(num_frames):
- timestep = frame_idx * hop_length
- buffer[:, :frame_length] = padded_waveform_batch[:, timestep : timestep + frame_length]
-
- if dither != 0.0:
- buffer[:, :frame_length] += dither * np.random.randn(*buffer[:, :frame_length].shape)
-
- if remove_dc_offset:
- buffer[:, :frame_length] -= buffer[:, :frame_length].mean(axis=1, keepdims=True)
-
- if preemphasis is not None:
- buffer[:, 1:frame_length] -= preemphasis * buffer[:, : frame_length - 1]
- buffer[:, 0] *= 1 - preemphasis
-
- buffer[:, :frame_length] *= window
-
- spectrogram[:, frame_idx] = fft_func(buffer)
-
- # Note: ** is much faster than np.power
- if power is not None:
- spectrogram = np.abs(spectrogram, dtype=np.float64) ** power
-
- # Apply mel filters if provided
- if mel_filters is not None:
- result = np.tensordot(spectrogram, mel_filters.T, axes=([2], [1]))
- spectrogram = np.maximum(mel_floor, result)
-
- # Convert to log scale if specified
- if power is not None and log_mel is not None:
- if log_mel == "log":
- spectrogram = np.log(spectrogram)
- elif log_mel == "log10":
- spectrogram = np.log10(spectrogram)
- elif log_mel == "dB":
- if power == 1.0:
- spectrogram = amplitude_to_db_batch(spectrogram, reference, min_value, db_range)
- elif power == 2.0:
- spectrogram = power_to_db_batch(spectrogram, reference, min_value, db_range)
- else:
- raise ValueError(f"Cannot use log_mel option '{log_mel}' with power {power}")
- else:
- raise ValueError(f"Unknown log_mel option: {log_mel}")
-
- spectrogram = np.asarray(spectrogram, dtype)
-
- spectrogram_list = [spectrogram[i, : true_num_frames[i], :].T for i in range(len(true_num_frames))]
-
- return spectrogram_list
-
def power_to_db(
spectrogram: np.ndarray,
@@ -1094,55 +807,6 @@ def power_to_db(
return spectrogram
-def power_to_db_batch(
- spectrogram: np.ndarray,
- reference: float = 1.0,
- min_value: float = 1e-10,
- db_range: float | None = None,
-) -> np.ndarray:
- """
- Converts a batch of power spectrograms to the decibel scale. This computes `10 * log10(spectrogram / reference)`,
- using basic logarithm properties for numerical stability.
-
- This function supports batch processing, where each item in the batch is an individual power (mel) spectrogram.
-
- Args:
- spectrogram (`np.ndarray`):
- The input batch of power (mel) spectrograms. Expected shape is (batch_size, *spectrogram_shape).
- Note that a power spectrogram has the amplitudes squared!
- reference (`float`, *optional*, defaults to 1.0):
- Sets the input spectrogram value that corresponds to 0 dB. For example, use `np.max(spectrogram)` to set
- the loudest part to 0 dB. Must be greater than zero.
- min_value (`float`, *optional*, defaults to `1e-10`):
- The spectrogram will be clipped to this minimum value before conversion to decibels, to avoid taking
- `log(0)`. The default of `1e-10` corresponds to a minimum of -100 dB. Must be greater than zero.
- db_range (`float`, *optional*):
- Sets the maximum dynamic range in decibels. For example, if `db_range = 80`, the difference between the
- peak value and the smallest value will never be more than 80 dB. Must be greater than zero.
-
- Returns:
- `np.ndarray`: the batch of spectrograms in decibels
- """
- if reference <= 0.0:
- raise ValueError("reference must be greater than zero")
- if min_value <= 0.0:
- raise ValueError("min_value must be greater than zero")
-
- reference = max(min_value, reference)
-
- spectrogram = np.clip(spectrogram, a_min=min_value, a_max=None)
- spectrogram = 10.0 * (np.log10(spectrogram) - np.log10(reference))
-
- if db_range is not None:
- if db_range <= 0.0:
- raise ValueError("db_range must be greater than zero")
- # Apply db_range clipping per batch item
- max_values = spectrogram.max(axis=(1, 2), keepdims=True)
- spectrogram = np.clip(spectrogram, a_min=max_values - db_range, a_max=None)
-
- return spectrogram
-
-
def amplitude_to_db(
spectrogram: np.ndarray,
reference: float = 1.0,
@@ -1192,46 +856,3 @@ def amplitude_to_db(
return spectrogram
-def amplitude_to_db_batch(
- spectrogram: np.ndarray, reference: float = 1.0, min_value: float = 1e-5, db_range: float | None = None
-) -> np.ndarray:
- """
- Converts a batch of amplitude spectrograms to the decibel scale. This computes `20 * log10(spectrogram / reference)`,
- using basic logarithm properties for numerical stability.
-
- The function supports batch processing, where each item in the batch is an individual amplitude (mel) spectrogram.
-
- Args:
- spectrogram (`np.ndarray`):
- The input batch of amplitude (mel) spectrograms. Expected shape is (batch_size, *spectrogram_shape).
- reference (`float`, *optional*, defaults to 1.0):
- Sets the input spectrogram value that corresponds to 0 dB. For example, use `np.max(spectrogram)` to set
- the loudest part to 0 dB. Must be greater than zero.
- min_value (`float`, *optional*, defaults to `1e-5`):
- The spectrogram will be clipped to this minimum value before conversion to decibels, to avoid taking
- `log(0)`. The default of `1e-5` corresponds to a minimum of -100 dB. Must be greater than zero.
- db_range (`float`, *optional*):
- Sets the maximum dynamic range in decibels. For example, if `db_range = 80`, the difference between the
- peak value and the smallest value will never be more than 80 dB. Must be greater than zero.
-
- Returns:
- `np.ndarray`: the batch of spectrograms in decibels
- """
- if reference <= 0.0:
- raise ValueError("reference must be greater than zero")
- if min_value <= 0.0:
- raise ValueError("min_value must be greater than zero")
-
- reference = max(min_value, reference)
-
- spectrogram = np.clip(spectrogram, a_min=min_value, a_max=None)
- spectrogram = 20.0 * (np.log10(spectrogram) - np.log10(reference))
-
- if db_range is not None:
- if db_range <= 0.0:
- raise ValueError("db_range must be greater than zero")
- # Apply db_range clipping per batch item
- max_values = spectrogram.max(axis=(1, 2), keepdims=True)
- spectrogram = np.clip(spectrogram, a_min=max_values - db_range, a_max=None)
-
- return spectrogram
diff --git a/src/transformers/feature_extraction_utils.py b/src/transformers/feature_extraction_utils.py
index f1b66f752da4..b30a056f7794 100644
--- a/src/transformers/feature_extraction_utils.py
+++ b/src/transformers/feature_extraction_utils.py
@@ -15,20 +15,15 @@
Feature extraction saving/loading class for common feature extractors.
"""
-import copy
-import json
import os
from collections import UserDict
from typing import TYPE_CHECKING, Any, TypeVar, Union
import numpy as np
-from huggingface_hub import create_repo, is_offline_mode
-from .dynamic_module_utils import custom_object_save
+from .preprocessing_base import PreprocessingMixin
from .utils import (
FEATURE_EXTRACTOR_NAME,
- PROCESSOR_NAME,
- PushToHubMixin,
TensorType,
_is_tensor_or_array_like,
copy_func,
@@ -38,9 +33,7 @@
is_torch_dtype,
logging,
requires_backends,
- safe_load_json_file,
)
-from .utils.hub import cached_file
if TYPE_CHECKING:
@@ -263,170 +256,21 @@ def maybe_to(v):
return self
-class FeatureExtractionMixin(PushToHubMixin):
+class FeatureExtractionMixin(PreprocessingMixin):
"""
This is a feature extraction mixin used to provide saving/loading functionality for sequential and audio feature
extractors.
"""
- _auto_class = None
-
- def __init__(self, **kwargs):
- """Set elements of `kwargs` as attributes."""
- # Pop "processor_class", it should not be saved in feature extractor config
- kwargs.pop("processor_class", None)
- # Additional attributes without default values
- for key, value in kwargs.items():
- try:
- setattr(self, key, value)
- except AttributeError as err:
- logger.error(f"Can't set {key} with value {value} for {self}")
- raise err
-
- @classmethod
- def from_pretrained(
- cls: type[SpecificFeatureExtractorType],
- pretrained_model_name_or_path: str | os.PathLike,
- cache_dir: str | os.PathLike | None = None,
- force_download: bool = False,
- local_files_only: bool = False,
- token: str | bool | None = None,
- revision: str = "main",
- **kwargs,
- ) -> SpecificFeatureExtractorType:
- r"""
- Instantiate a type of [`~feature_extraction_utils.FeatureExtractionMixin`] from a feature extractor, *e.g.* a
- derived class of [`SequenceFeatureExtractor`].
-
- Args:
- pretrained_model_name_or_path (`str` or `os.PathLike`):
- This can be either:
-
- - a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
- huggingface.co.
- - a path to a *directory* containing a feature extractor file saved using the
- [`~feature_extraction_utils.FeatureExtractionMixin.save_pretrained`] method, e.g.,
- `./my_model_directory/`.
- - a path or url to a saved feature extractor JSON *file*, e.g.,
- `./my_model_directory/preprocessor_config.json`.
- cache_dir (`str` or `os.PathLike`, *optional*):
- Path to a directory in which a downloaded pretrained model feature extractor should be cached if the
- standard cache should not be used.
- force_download (`bool`, *optional*, defaults to `False`):
- Whether or not to force to (re-)download the feature extractor files and override the cached versions
- if they exist.
- proxies (`dict[str, str]`, *optional*):
- A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
- 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
- token (`str` or `bool`, *optional*):
- The token to use as HTTP bearer authorization for remote files. If `True`, or not specified, will use
- the token generated when running `hf auth login` (stored in `~/.huggingface`).
- revision (`str`, *optional*, defaults to `"main"`):
- The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
- git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
- identifier allowed by git.
-
-
-
-
- To test a pull request you made on the Hub, you can pass `revision="refs/pr/"`.
-
-
-
- return_unused_kwargs (`bool`, *optional*, defaults to `False`):
- If `False`, then this function returns just the final feature extractor object. If `True`, then this
- functions returns a `Tuple(feature_extractor, unused_kwargs)` where *unused_kwargs* is a dictionary
- consisting of the key/value pairs whose keys are not feature extractor attributes: i.e., the part of
- `kwargs` which has not been used to update `feature_extractor` and is otherwise ignored.
- kwargs (`dict[str, Any]`, *optional*):
- The values in kwargs of any keys which are feature extractor attributes will be used to override the
- loaded values. Behavior concerning key/value pairs whose keys are *not* feature extractor attributes is
- controlled by the `return_unused_kwargs` keyword parameter.
-
- Returns:
- A feature extractor of type [`~feature_extraction_utils.FeatureExtractionMixin`].
-
- Examples:
-
- ```python
- # We can't instantiate directly the base class *FeatureExtractionMixin* nor *SequenceFeatureExtractor* so let's show the examples on a
- # derived class: *Wav2Vec2FeatureExtractor*
- feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
- "facebook/wav2vec2-base-960h"
- ) # Download feature_extraction_config from huggingface.co and cache.
- feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
- "./test/saved_model/"
- ) # E.g. feature_extractor (or model) was saved using *save_pretrained('./test/saved_model/')*
- feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("./test/saved_model/preprocessor_config.json")
- feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
- "facebook/wav2vec2-base-960h", return_attention_mask=False, foo=False
- )
- assert feature_extractor.return_attention_mask is False
- feature_extractor, unused_kwargs = Wav2Vec2FeatureExtractor.from_pretrained(
- "facebook/wav2vec2-base-960h", return_attention_mask=False, foo=False, return_unused_kwargs=True
- )
- assert feature_extractor.return_attention_mask is False
- assert unused_kwargs == {"foo": False}
- ```"""
- kwargs["cache_dir"] = cache_dir
- kwargs["force_download"] = force_download
- kwargs["local_files_only"] = local_files_only
- kwargs["revision"] = revision
-
- if token is not None:
- kwargs["token"] = token
-
- feature_extractor_dict, kwargs = cls.get_feature_extractor_dict(pretrained_model_name_or_path, **kwargs)
-
- return cls.from_dict(feature_extractor_dict, **kwargs)
-
- def save_pretrained(self, save_directory: str | os.PathLike, push_to_hub: bool = False, **kwargs):
- """
- Save a feature_extractor object to the directory `save_directory`, so that it can be re-loaded using the
- [`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`] class method.
-
- Args:
- save_directory (`str` or `os.PathLike`):
- Directory where the feature extractor JSON file will be saved (will be created if it does not exist).
- push_to_hub (`bool`, *optional*, defaults to `False`):
- Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
- repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
- namespace).
- kwargs (`dict[str, Any]`, *optional*):
- Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
- """
- if os.path.isfile(save_directory):
- raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file")
-
- os.makedirs(save_directory, exist_ok=True)
-
- if push_to_hub:
- commit_message = kwargs.pop("commit_message", None)
- repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1])
- repo_id = create_repo(repo_id, exist_ok=True, **kwargs).repo_id
- files_timestamps = self._get_files_timestamps(save_directory)
-
- # If we have a custom config, we copy the file defining it in the folder and set the attributes so it can be
- # loaded from the Hub.
- if self._auto_class is not None:
- custom_object_save(self, save_directory, config=self)
-
- # If we save using the predefined names, we can load using `from_pretrained`
- output_feature_extractor_file = os.path.join(save_directory, FEATURE_EXTRACTOR_NAME)
-
- self.to_json_file(output_feature_extractor_file)
- logger.info(f"Feature extractor saved in {output_feature_extractor_file}")
-
- if push_to_hub:
- self._upload_modified_files(
- save_directory,
- repo_id,
- files_timestamps,
- commit_message=commit_message,
- token=kwargs.get("token"),
- )
-
- return [output_feature_extractor_file]
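+    # Class-level hooks for the shared `PreprocessingMixin` save/load path (they mirror
+    # the legacy behavior removed above): the config filename to write, the JSON type
+    # key, the nested keys to look up inside a `processor_config.json`, and attributes
+    # excluded from `to_dict` serialization.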
+ _config_name = FEATURE_EXTRACTOR_NAME
+ _type_key = "feature_extractor_type"
+ _nested_config_keys = ["feature_extractor", "audio_processor"]
+ _auto_class_default = "AutoFeatureExtractor"
+ _file_type_label = "feature extractor"
+ _excluded_dict_keys = {"mel_filters", "window"}
+ _extra_init_pops = []
+ _config_filename_kwarg = None
+ _subfolder_default = None
@classmethod
def get_feature_extractor_dict(
@@ -443,104 +287,7 @@ def get_feature_extractor_dict(
Returns:
`tuple[Dict, Dict]`: The dictionary(ies) that will be used to instantiate the feature extractor object.
"""
- cache_dir = kwargs.pop("cache_dir", None)
- force_download = kwargs.pop("force_download", False)
- proxies = kwargs.pop("proxies", None)
- subfolder = kwargs.pop("subfolder", None)
- token = kwargs.pop("token", None)
- local_files_only = kwargs.pop("local_files_only", False)
- revision = kwargs.pop("revision", None)
-
- from_pipeline = kwargs.pop("_from_pipeline", None)
- from_auto_class = kwargs.pop("_from_auto", False)
-
- user_agent = {"file_type": "feature extractor", "from_auto_class": from_auto_class}
- if from_pipeline is not None:
- user_agent["using_pipeline"] = from_pipeline
-
- if is_offline_mode() and not local_files_only:
- logger.info("Offline mode: forcing local_files_only=True")
- local_files_only = True
-
- pretrained_model_name_or_path = str(pretrained_model_name_or_path)
- is_local = os.path.isdir(pretrained_model_name_or_path)
- if os.path.isdir(pretrained_model_name_or_path):
- feature_extractor_file = os.path.join(pretrained_model_name_or_path, FEATURE_EXTRACTOR_NAME)
- if os.path.isfile(pretrained_model_name_or_path):
- resolved_feature_extractor_file = pretrained_model_name_or_path
- resolved_processor_file = None
- is_local = True
- else:
- feature_extractor_file = FEATURE_EXTRACTOR_NAME
- try:
- # Load from local folder or from cache or download from model Hub and cache
- resolved_processor_file = cached_file(
- pretrained_model_name_or_path,
- filename=PROCESSOR_NAME,
- cache_dir=cache_dir,
- force_download=force_download,
- proxies=proxies,
- local_files_only=local_files_only,
- token=token,
- user_agent=user_agent,
- revision=revision,
- subfolder=subfolder,
- _raise_exceptions_for_missing_entries=False,
- )
- resolved_feature_extractor_file = cached_file(
- pretrained_model_name_or_path,
- filename=feature_extractor_file,
- cache_dir=cache_dir,
- force_download=force_download,
- proxies=proxies,
- local_files_only=local_files_only,
- token=token,
- user_agent=user_agent,
- revision=revision,
- subfolder=subfolder,
- _raise_exceptions_for_missing_entries=False,
- )
- except OSError:
- # Raise any environment error raise by `cached_file`. It will have a helpful error message adapted to
- # the original exception.
- raise
- except Exception:
- # For any other exception, we throw a generic error.
- raise OSError(
- f"Can't load feature extractor for '{pretrained_model_name_or_path}'. If you were trying to load"
- " it from 'https://huggingface.co/models', make sure you don't have a local directory with the"
- f" same name. Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a"
- f" directory containing a {FEATURE_EXTRACTOR_NAME} file"
- )
-
- # Load feature_extractor dict. Priority goes as (nested config if found -> image processor config)
- # We are downloading both configs because almost all models have a `processor_config.json` but
- # not all of these are nested. We need to check if it was saved recebtly as nested or if it is legacy style
- feature_extractor_dict = None
- if resolved_processor_file is not None:
- processor_dict = safe_load_json_file(resolved_processor_file)
- if "feature_extractor" in processor_dict or "audio_processor" in processor_dict:
- feature_extractor_dict = processor_dict.get("feature_extractor", processor_dict.get("audio_processor"))
-
- if resolved_feature_extractor_file is not None and feature_extractor_dict is None:
- feature_extractor_dict = safe_load_json_file(resolved_feature_extractor_file)
-
- if feature_extractor_dict is None:
- raise OSError(
- f"Can't load feature extractor for '{pretrained_model_name_or_path}'. If you were trying to load"
- " it from 'https://huggingface.co/models', make sure you don't have a local directory with the"
- f" same name. Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a"
- f" directory containing a {feature_extractor_file} file"
- )
-
- if is_local:
- logger.info(f"loading configuration file {resolved_feature_extractor_file}")
- else:
- logger.info(
- f"loading configuration file {feature_extractor_file} from cache at {resolved_feature_extractor_file}"
- )
-
- return feature_extractor_dict, kwargs
+ return cls._get_config_dict(pretrained_model_name_or_path, **kwargs)
@classmethod
def from_dict(
@@ -581,89 +328,6 @@ def from_dict(
else:
return feature_extractor
- def to_dict(self) -> dict[str, Any]:
- """
- Serializes this instance to a Python dictionary. Returns:
- `dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance.
- """
- output = copy.deepcopy(self.__dict__)
- output["feature_extractor_type"] = self.__class__.__name__
- if "mel_filters" in output:
- del output["mel_filters"]
- if "window" in output:
- del output["window"]
- return output
-
- @classmethod
- def from_json_file(cls, json_file: str | os.PathLike) -> "FeatureExtractionMixin":
- """
- Instantiates a feature extractor of type [`~feature_extraction_utils.FeatureExtractionMixin`] from the path to
- a JSON file of parameters.
-
- Args:
- json_file (`str` or `os.PathLike`):
- Path to the JSON file containing the parameters.
-
- Returns:
- A feature extractor of type [`~feature_extraction_utils.FeatureExtractionMixin`]: The feature_extractor
- object instantiated from that JSON file.
- """
- with open(json_file, encoding="utf-8") as reader:
- text = reader.read()
- feature_extractor_dict = json.loads(text)
- return cls(**feature_extractor_dict)
-
- def to_json_string(self) -> str:
- """
- Serializes this instance to a JSON string.
-
- Returns:
- `str`: String containing all the attributes that make up this feature_extractor instance in JSON format.
- """
- dictionary = self.to_dict()
-
- for key, value in dictionary.items():
- if isinstance(value, np.ndarray):
- dictionary[key] = value.tolist()
-
- return json.dumps(dictionary, indent=2, sort_keys=True) + "\n"
-
- def to_json_file(self, json_file_path: str | os.PathLike):
- """
- Save this instance to a JSON file.
-
- Args:
- json_file_path (`str` or `os.PathLike`):
- Path to the JSON file in which this feature_extractor instance's parameters will be saved.
- """
- with open(json_file_path, "w", encoding="utf-8") as writer:
- writer.write(self.to_json_string())
-
- def __repr__(self):
- return f"{self.__class__.__name__} {self.to_json_string()}"
-
- @classmethod
- def register_for_auto_class(cls, auto_class="AutoFeatureExtractor"):
- """
- Register this class with a given auto class. This should only be used for custom feature extractors as the ones
- in the library are already mapped with `AutoFeatureExtractor`.
-
-
-
- Args:
- auto_class (`str` or `type`, *optional*, defaults to `"AutoFeatureExtractor"`):
- The auto class to register this new feature extractor with.
- """
- if not isinstance(auto_class, str):
- auto_class = auto_class.__name__
-
- import transformers.models.auto as auto_module
-
- if not hasattr(auto_module, auto_class):
- raise ValueError(f"{auto_class} is not a valid auto class.")
-
- cls._auto_class = auto_class
-
FeatureExtractionMixin.push_to_hub = copy_func(FeatureExtractionMixin.push_to_hub)
if FeatureExtractionMixin.push_to_hub.__doc__ is not None:
diff --git a/src/transformers/image_processing_base.py b/src/transformers/image_processing_base.py
index 72db8fcc9bec..79d2f7bf2aec 100644
--- a/src/transformers/image_processing_base.py
+++ b/src/transformers/image_processing_base.py
@@ -12,26 +12,17 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-import copy
-import json
import os
from typing import Any, TypeVar
-import numpy as np
-from huggingface_hub import create_repo, is_offline_mode
-
-from .dynamic_module_utils import custom_object_save
from .feature_extraction_utils import BatchFeature as BaseBatchFeature
from .image_utils import is_valid_image, load_image
+from .preprocessing_base import PreprocessingMixin
from .utils import (
IMAGE_PROCESSOR_NAME,
- PROCESSOR_NAME,
- PushToHubMixin,
copy_func,
logging,
- safe_load_json_file,
)
-from .utils.hub import cached_file
ImageProcessorType = TypeVar("ImageProcessorType", bound="ImageProcessingMixin")
@@ -58,175 +49,21 @@ class BatchFeature(BaseBatchFeature):
# TODO: (Amy) - factor out the common parts of this and the feature extractor
-class ImageProcessingMixin(PushToHubMixin):
+class ImageProcessingMixin(PreprocessingMixin):
"""
This is an image processor mixin used to provide saving/loading functionality for sequential and image feature
extractors.
"""
- _auto_class = None
-
- def __init__(self, **kwargs):
- """Set elements of `kwargs` as attributes."""
- # This key was saved while we still used `XXXFeatureExtractor` for image processing. Now we use
- # `XXXImageProcessor`, this attribute and its value are misleading.
- kwargs.pop("feature_extractor_type", None)
- # Pop "processor_class", should not be saved with image processing config anymore
- kwargs.pop("processor_class", None)
- # Additional attributes without default values
- for key, value in kwargs.items():
- try:
- setattr(self, key, value)
- except AttributeError as err:
- logger.error(f"Can't set {key} with value {value} for {self}")
- raise err
-
- @classmethod
- def from_pretrained(
- cls: type[ImageProcessorType],
- pretrained_model_name_or_path: str | os.PathLike,
- cache_dir: str | os.PathLike | None = None,
- force_download: bool = False,
- local_files_only: bool = False,
- token: str | bool | None = None,
- revision: str = "main",
- **kwargs,
- ) -> ImageProcessorType:
- r"""
- Instantiate a type of [`~image_processing_utils.ImageProcessingMixin`] from an image processor.
-
- Args:
- pretrained_model_name_or_path (`str` or `os.PathLike`):
- This can be either:
-
- - a string, the *model id* of a pretrained image_processor hosted inside a model repo on
- huggingface.co.
- - a path to a *directory* containing a image processor file saved using the
- [`~image_processing_utils.ImageProcessingMixin.save_pretrained`] method, e.g.,
- `./my_model_directory/`.
- - a path or url to a saved image processor JSON *file*, e.g.,
- `./my_model_directory/preprocessor_config.json`.
- cache_dir (`str` or `os.PathLike`, *optional*):
- Path to a directory in which a downloaded pretrained model image processor should be cached if the
- standard cache should not be used.
- force_download (`bool`, *optional*, defaults to `False`):
- Whether or not to force to (re-)download the image processor files and override the cached versions if
- they exist.
- proxies (`dict[str, str]`, *optional*):
- A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
- 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
- token (`str` or `bool`, *optional*):
- The token to use as HTTP bearer authorization for remote files. If `True`, or not specified, will use
- the token generated when running `hf auth login` (stored in `~/.huggingface`).
- revision (`str`, *optional*, defaults to `"main"`):
- The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
- git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
- identifier allowed by git.
-
-
-
-
- To test a pull request you made on the Hub, you can pass `revision="refs/pr/"`.
-
-
-
- return_unused_kwargs (`bool`, *optional*, defaults to `False`):
- If `False`, then this function returns just the final image processor object. If `True`, then this
- functions returns a `Tuple(image_processor, unused_kwargs)` where *unused_kwargs* is a dictionary
- consisting of the key/value pairs whose keys are not image processor attributes: i.e., the part of
- `kwargs` which has not been used to update `image_processor` and is otherwise ignored.
- subfolder (`str`, *optional*, defaults to `""`):
- In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can
- specify the folder name here.
- kwargs (`dict[str, Any]`, *optional*):
- The values in kwargs of any keys which are image processor attributes will be used to override the
- loaded values. Behavior concerning key/value pairs whose keys are *not* image processor attributes is
- controlled by the `return_unused_kwargs` keyword parameter.
-
- Returns:
- A image processor of type [`~image_processing_utils.ImageProcessingMixin`].
-
- Examples:
-
- ```python
- # We can't instantiate directly the base class *ImageProcessingMixin* so let's show the examples on a
- # derived class: *CLIPImageProcessor*
- image_processor = CLIPImageProcessor.from_pretrained(
- "openai/clip-vit-base-patch32"
- ) # Download image_processing_config from huggingface.co and cache.
- image_processor = CLIPImageProcessor.from_pretrained(
- "./test/saved_model/"
- ) # E.g. image processor (or model) was saved using *save_pretrained('./test/saved_model/')*
- image_processor = CLIPImageProcessor.from_pretrained("./test/saved_model/preprocessor_config.json")
- image_processor = CLIPImageProcessor.from_pretrained(
- "openai/clip-vit-base-patch32", do_normalize=False, foo=False
- )
- assert image_processor.do_normalize is False
- image_processor, unused_kwargs = CLIPImageProcessor.from_pretrained(
- "openai/clip-vit-base-patch32", do_normalize=False, foo=False, return_unused_kwargs=True
- )
- assert image_processor.do_normalize is False
- assert unused_kwargs == {"foo": False}
- ```"""
- kwargs["cache_dir"] = cache_dir
- kwargs["force_download"] = force_download
- kwargs["local_files_only"] = local_files_only
- kwargs["revision"] = revision
-
- if token is not None:
- kwargs["token"] = token
-
- image_processor_dict, kwargs = cls.get_image_processor_dict(pretrained_model_name_or_path, **kwargs)
-
- return cls.from_dict(image_processor_dict, **kwargs)
-
- def save_pretrained(self, save_directory: str | os.PathLike, push_to_hub: bool = False, **kwargs):
- """
- Save an image processor object to the directory `save_directory`, so that it can be re-loaded using the
- [`~image_processing_utils.ImageProcessingMixin.from_pretrained`] class method.
-
- Args:
- save_directory (`str` or `os.PathLike`):
- Directory where the image processor JSON file will be saved (will be created if it does not exist).
- push_to_hub (`bool`, *optional*, defaults to `False`):
- Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
- repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
- namespace).
- kwargs (`dict[str, Any]`, *optional*):
- Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
- """
- if os.path.isfile(save_directory):
- raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file")
-
- os.makedirs(save_directory, exist_ok=True)
-
- if push_to_hub:
- commit_message = kwargs.pop("commit_message", None)
- repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1])
- repo_id = create_repo(repo_id, exist_ok=True, **kwargs).repo_id
- files_timestamps = self._get_files_timestamps(save_directory)
-
- # If we have a custom config, we copy the file defining it in the folder and set the attributes so it can be
- # loaded from the Hub.
- if self._auto_class is not None:
- custom_object_save(self, save_directory, config=self)
-
- # If we save using the predefined names, we can load using `from_pretrained`
- output_image_processor_file = os.path.join(save_directory, IMAGE_PROCESSOR_NAME)
-
- self.to_json_file(output_image_processor_file)
- logger.info(f"Image processor saved in {output_image_processor_file}")
-
- if push_to_hub:
- self._upload_modified_files(
- save_directory,
- repo_id,
- files_timestamps,
- commit_message=commit_message,
- token=kwargs.get("token"),
- )
-
- return [output_image_processor_file]
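+ # Class-level hooks for the generic `_get_config_dict` helper (presumably defined
+ # on a shared base mixin; see `get_image_processor_dict` below), which replaces
+ # the hub download/parsing logic deleted above.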
+ _config_name = IMAGE_PROCESSOR_NAME
+ _type_key = "image_processor_type"
+ _nested_config_keys = ["image_processor"]
+ _auto_class_default = "AutoImageProcessor"
+ _file_type_label = "image processor"
+ _excluded_dict_keys = set()
+ _extra_init_pops = ["feature_extractor_type"]
+ _config_filename_kwarg = "image_processor_filename"
+ _subfolder_default = ""
@classmethod
def get_image_processor_dict(
@@ -248,227 +85,7 @@ def get_image_processor_dict(
Returns:
`tuple[Dict, Dict]`: The dictionary(ies) that will be used to instantiate the image processor object.
"""
- cache_dir = kwargs.pop("cache_dir", None)
- force_download = kwargs.pop("force_download", False)
- proxies = kwargs.pop("proxies", None)
- token = kwargs.pop("token", None)
- local_files_only = kwargs.pop("local_files_only", False)
- revision = kwargs.pop("revision", None)
- subfolder = kwargs.pop("subfolder", "")
- image_processor_filename = kwargs.pop("image_processor_filename", IMAGE_PROCESSOR_NAME)
-
- from_pipeline = kwargs.pop("_from_pipeline", None)
- from_auto_class = kwargs.pop("_from_auto", False)
-
- user_agent = {"file_type": "image processor", "from_auto_class": from_auto_class}
- if from_pipeline is not None:
- user_agent["using_pipeline"] = from_pipeline
-
- if is_offline_mode() and not local_files_only:
- logger.info("Offline mode: forcing local_files_only=True")
- local_files_only = True
-
- pretrained_model_name_or_path = str(pretrained_model_name_or_path)
- is_local = os.path.isdir(pretrained_model_name_or_path)
- if os.path.isdir(pretrained_model_name_or_path):
- image_processor_file = os.path.join(pretrained_model_name_or_path, image_processor_filename)
- if os.path.isfile(pretrained_model_name_or_path):
- resolved_image_processor_file = pretrained_model_name_or_path
- resolved_processor_file = None
- is_local = True
- else:
- image_processor_file = image_processor_filename
- try:
- resolved_processor_file = cached_file(
- pretrained_model_name_or_path,
- filename=PROCESSOR_NAME,
- cache_dir=cache_dir,
- force_download=force_download,
- proxies=proxies,
- local_files_only=local_files_only,
- token=token,
- user_agent=user_agent,
- revision=revision,
- subfolder=subfolder,
- _raise_exceptions_for_missing_entries=False,
- )
- resolved_image_processor_file = cached_file(
- pretrained_model_name_or_path,
- filename=image_processor_file,
- cache_dir=cache_dir,
- force_download=force_download,
- proxies=proxies,
- local_files_only=local_files_only,
- token=token,
- user_agent=user_agent,
- revision=revision,
- subfolder=subfolder,
- _raise_exceptions_for_missing_entries=False,
- )
- except OSError:
- # Re-raise any environment error raised by `cached_file`. It will have a helpful error message adapted to
- # the original exception.
- raise
- except Exception:
- # For any other exception, we throw a generic error.
- raise OSError(
- f"Can't load image processor for '{pretrained_model_name_or_path}'. If you were trying to load"
- " it from 'https://huggingface.co/models', make sure you don't have a local directory with the"
- f" same name. Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a"
- f" directory containing a {image_processor_filename} file"
- )
-
- # Load image_processor dict. Priority goes to the nested config if found, then to the image processor config.
- # We download both configs because almost all models have a `processor_config.json`, but not all of these are
- # nested. We need to check whether it was saved recently as a nested config or in the legacy style.
- image_processor_dict = None
- if resolved_processor_file is not None:
- processor_dict = safe_load_json_file(resolved_processor_file)
- if "image_processor" in processor_dict:
- image_processor_dict = processor_dict["image_processor"]
-
- if resolved_image_processor_file is not None and image_processor_dict is None:
- image_processor_dict = safe_load_json_file(resolved_image_processor_file)
-
- if image_processor_dict is None:
- raise OSError(
- f"Can't load image processor for '{pretrained_model_name_or_path}'. If you were trying to load"
- " it from 'https://huggingface.co/models', make sure you don't have a local directory with the"
- f" same name. Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a"
- f" directory containing a {image_processor_filename} file"
- )
-
- if is_local:
- logger.info(f"loading configuration file {resolved_image_processor_file}")
- else:
- logger.info(
- f"loading configuration file {image_processor_file} from cache at {resolved_image_processor_file}"
- )
-
- return image_processor_dict, kwargs
-
- @classmethod
- def from_dict(cls, image_processor_dict: dict[str, Any], **kwargs):
- """
- Instantiates a type of [`~image_processing_utils.ImageProcessingMixin`] from a Python dictionary of parameters.
-
- Args:
- image_processor_dict (`dict[str, Any]`):
- Dictionary that will be used to instantiate the image processor object. Such a dictionary can be
- retrieved from a pretrained checkpoint by leveraging the
- [`~image_processing_utils.ImageProcessingMixin.to_dict`] method.
- kwargs (`dict[str, Any]`):
- Additional parameters from which to initialize the image processor object.
-
- Returns:
- [`~image_processing_utils.ImageProcessingMixin`]: The image processor object instantiated from those
- parameters.
- """
- image_processor_dict = image_processor_dict.copy()
- return_unused_kwargs = kwargs.pop("return_unused_kwargs", False)
- image_processor_dict.update({k: v for k, v in kwargs.items() if k in cls.valid_kwargs.__annotations__})
- image_processor = cls(**image_processor_dict)
-
- # Apply extra kwargs to instance (BC for remote code, e.g. phi4_multimodal)
- extra_keys = []
- for key in reversed(list(kwargs.keys())):
- if hasattr(image_processor, key) and key not in cls.valid_kwargs.__annotations__:
- setattr(image_processor, key, kwargs.pop(key, None))
- extra_keys.append(key)
- if extra_keys:
- logger.warning_once(
- f"Image processor {cls.__name__}: kwargs {extra_keys} were applied for backward compatibility. "
- f"To avoid this warning, add them to valid_kwargs: create a custom TypedDict extending "
- f"ImagesKwargs with these keys and set it as the `valid_kwargs` class attribute."
- )
-
- logger.info(f"Image processor {image_processor}")
- if return_unused_kwargs:
- return image_processor, kwargs
- else:
- return image_processor
-
- def to_dict(self) -> dict[str, Any]:
- """
- Serializes this instance to a Python dictionary.
-
- Returns:
- `dict[str, Any]`: Dictionary of all the attributes that make up this image processor instance.
- """
- output = copy.deepcopy(self.__dict__)
- output["image_processor_type"] = self.__class__.__name__
-
- return output
-
- @classmethod
- def from_json_file(cls, json_file: str | os.PathLike):
- """
- Instantiates an image processor of type [`~image_processing_utils.ImageProcessingMixin`] from the path to a JSON
- file of parameters.
-
- Args:
- json_file (`str` or `os.PathLike`):
- Path to the JSON file containing the parameters.
-
- Returns:
- An image processor of type [`~image_processing_utils.ImageProcessingMixin`]: The image_processor object
- instantiated from that JSON file.
- """
- with open(json_file, encoding="utf-8") as reader:
- text = reader.read()
- image_processor_dict = json.loads(text)
- return cls(**image_processor_dict)
-
- def to_json_string(self) -> str:
- """
- Serializes this instance to a JSON string.
-
- Returns:
- `str`: String containing all the attributes that make up this image processor instance in JSON format.
- """
- dictionary = self.to_dict()
-
- for key, value in dictionary.items():
- if isinstance(value, np.ndarray):
- dictionary[key] = value.tolist()
-
- return json.dumps(dictionary, indent=2, sort_keys=True) + "\n"
-
- def to_json_file(self, json_file_path: str | os.PathLike):
- """
- Save this instance to a JSON file.
-
- Args:
- json_file_path (`str` or `os.PathLike`):
- Path to the JSON file in which this image_processor instance's parameters will be saved.
- """
- with open(json_file_path, "w", encoding="utf-8") as writer:
- writer.write(self.to_json_string())
-
- def __repr__(self):
- return f"{self.__class__.__name__} {self.to_json_string()}"
-
- @classmethod
- def register_for_auto_class(cls, auto_class="AutoImageProcessor"):
- """
- Register this class with a given auto class. This should only be used for custom image processors as the ones
- in the library are already mapped with `AutoImageProcessor`.
-
- Args:
- auto_class (`str` or `type`, *optional*, defaults to `"AutoImageProcessor"`):
- The auto class to register this new image processor with.
- """
- if not isinstance(auto_class, str):
- auto_class = auto_class.__name__
-
- import transformers.models.auto as auto_module
-
- if not hasattr(auto_module, auto_class):
- raise ValueError(f"{auto_class} is not a valid auto class.")
-
- cls._auto_class = auto_class
+ return cls._get_config_dict(pretrained_model_name_or_path, **kwargs)
def fetch_images(self, image_url_or_urls: str | list[str] | list[list[str]]):
"""
diff --git a/src/transformers/image_processing_utils.py b/src/transformers/image_processing_utils.py
index 9756866b6333..9fb1d9761ee1 100644
--- a/src/transformers/image_processing_utils.py
+++ b/src/transformers/image_processing_utils.py
@@ -14,7 +14,6 @@
import math
from collections.abc import Iterable
-from copy import deepcopy
from functools import partial
from typing import Any
@@ -193,20 +192,11 @@ class MyImageProcessor(TorchvisionBackend):
def __init__(self, **kwargs: Unpack[ImagesKwargs]):
super().__init__(**kwargs)
- attributes = {}
- for key in self.valid_kwargs.__annotations__:
- kwarg = kwargs.pop(key, None)
- if kwarg is not None:
- attributes[key] = kwarg
- else:
- attributes[key] = deepcopy(getattr(self, key, None))
+ attributes = {key: getattr(self, key) for key in self._valid_kwargs_names}
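+ # NOTE: `_valid_kwargs_names` and the per-kwarg attributes are presumably set by
+ # the base-class __init__ now, so this only re-reads and re-standardizes them.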
attributes = self._standardize_kwargs(**attributes)
for key, value in attributes.items():
setattr(self, key, value)
- # get valid kwargs names
- self._valid_kwargs_names = list(self.valid_kwargs.__annotations__.keys())
-
def __call__(self, images: ImageInput, *args, **kwargs: Unpack[ImagesKwargs]) -> BatchFeature:
"""Preprocess an image or a batch of images."""
return self.preprocess(images, *args, **kwargs)
diff --git a/src/transformers/models/audio_spectrogram_transformer/audio_processing_audio_spectrogram_transformer.py b/src/transformers/models/audio_spectrogram_transformer/audio_processing_audio_spectrogram_transformer.py
new file mode 100644
index 000000000000..7d9ba2cddec7
--- /dev/null
+++ b/src/transformers/models/audio_spectrogram_transformer/audio_processing_audio_spectrogram_transformer.py
@@ -0,0 +1,72 @@
+# Copyright 2025 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+
+from ...audio_processing_backends import NumpyAudioBackend
+from ...audio_utils import MelScaleConfig, SpectrogramConfig, StftConfig
+
+
+class AudioSpectrogramTransformerAudioProcessor(NumpyAudioBackend):
+ sample_rate = 16000
+ force_mono = True
+ return_padding_mask = False
+ do_batch_spectrogram = False
+
+ max_length_frames = 1024
+ do_normalize = True
+
+ # AudioSet normalization constants
+ ast_mean = -4.2677393
+ ast_std = 4.5689974
+
+ spectrogram_config = SpectrogramConfig(
+ stft_config=StftConfig(
+ n_fft=512,
+ win_length=400,
+ hop_length=160,
+ window_fn="hann_window",
+ power=2.0,
+ center=False,
+ periodic=False,
+ ),
+ mel_scale_config=MelScaleConfig(
+ n_mels=128,
+ f_min=20.0,
+ f_max=8000.0,
+ mel_scale="kaldi",
+ triangularize_in_mel_space=True,
+ ),
+ log_mode="log",
+ preemphasis=0.97,
+ remove_dc_offset=True,
+ mel_floor=1.192092955078125e-07,
+ )
+
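+ # `_kaldi_fbank` is assumed to be a base-class helper mirroring the legacy
+ # extractor: torchaudio.compliance.kaldi.fbank when torchaudio is available,
+ # otherwise the numpy spectrogram path configured in `spectrogram_config` above.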
+ def extract_spectrogram(self, audio, **kwargs):
+ return [self._kaldi_fbank(waveform, num_mel_bins=128, window_type="hanning") for waveform in audio]
+
+ def _pad_features(self, features, padding, max_length, truncation, pad_to_multiple_of):
+ # Always pad/truncate to max_length_frames regardless of caller's padding args
+ return super()._pad_features(features, "max_length", self.max_length_frames, True, pad_to_multiple_of)
+
+ def _postprocess_output(self, output, **kwargs):
+ # Rename to audio_values (AST convention) and apply AudioSet normalization
+ features = output.pop("audio_features")
+ if self.do_normalize:
+ features = (features - self.ast_mean) / (self.ast_std * 2)
+ output["audio_values"] = features
+ return output
+
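+ # Minimal usage sketch (assuming the backend keeps the usual feature-extractor
+ # call convention; `wav` is a 1-D float32 numpy array sampled at 16 kHz):
+ # processor = AudioSpectrogramTransformerAudioProcessor()
+ # out = processor(wav, sampling_rate=16000, return_tensors="np")
+ # out["audio_values"].shape # expected (1, 1024, 128): (batch, max_length_frames, n_mels)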
+
+__all__ = ["AudioSpectrogramTransformerAudioProcessor"]
diff --git a/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py b/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py
index ee69d1d0b991..80faf5663dec 100644
--- a/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py
+++ b/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py
@@ -11,225 +11,12 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-"""
-Feature extractor class for Audio Spectrogram Transformer.
-"""
+from ...utils.deprecation import deprecated_feature_extractor
+from .audio_processing_audio_spectrogram_transformer import AudioSpectrogramTransformerAudioProcessor
-import numpy as np
-
-from ...audio_utils import mel_filter_bank, spectrogram, window_function
-from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
-from ...feature_extraction_utils import BatchFeature
-from ...utils import TensorType, is_speech_available, is_torch_available, logging
-
-
-if is_speech_available():
- import torchaudio.compliance.kaldi as ta_kaldi
-
-if is_torch_available():
- import torch
-
-
-logger = logging.get_logger(__name__)
-
-
-class ASTFeatureExtractor(SequenceFeatureExtractor):
- r"""
- Constructs an Audio Spectrogram Transformer (AST) feature extractor.
-
- This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
- most of the main methods. Users should refer to this superclass for more information regarding those methods.
-
- This class extracts mel-filter bank features from raw speech using TorchAudio if installed or using numpy
- otherwise, pads/truncates them to a fixed length and normalizes them using a mean and standard deviation.
-
- Args:
- feature_size (`int`, *optional*, defaults to 1):
- The feature dimension of the extracted features.
- sampling_rate (`int`, *optional*, defaults to 16000):
- The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
- num_mel_bins (`int`, *optional*, defaults to 128):
- Number of Mel-frequency bins.
- max_length (`int`, *optional*, defaults to 1024):
- Maximum length to which to pad/truncate the extracted features.
- do_normalize (`bool`, *optional*, defaults to `True`):
- Whether or not to normalize the log-Mel features using `mean` and `std`.
- mean (`float`, *optional*, defaults to -4.2677393):
- The mean value used to normalize the log-Mel features. Uses the AudioSet mean by default.
- std (`float`, *optional*, defaults to 4.5689974):
- The standard deviation value used to normalize the log-Mel features. Uses the AudioSet standard deviation
- by default.
- return_attention_mask (`bool`, *optional*, defaults to `False`):
- Whether or not [`~ASTFeatureExtractor.__call__`] should return `attention_mask`.
- """
-
- model_input_names = ["input_values", "attention_mask"]
-
- def __init__(
- self,
- feature_size=1,
- sampling_rate=16000,
- num_mel_bins=128,
- max_length=1024,
- padding_value=0.0,
- do_normalize=True,
- mean=-4.2677393,
- std=4.5689974,
- return_attention_mask=False,
- **kwargs,
- ):
- super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs)
- self.num_mel_bins = num_mel_bins
- self.max_length = max_length
- self.do_normalize = do_normalize
- self.mean = mean
- self.std = std
- self.return_attention_mask = return_attention_mask
-
- if not is_speech_available():
- mel_filters = mel_filter_bank(
- num_frequency_bins=257,
- num_mel_filters=self.num_mel_bins,
- min_frequency=20,
- max_frequency=sampling_rate // 2,
- sampling_rate=sampling_rate,
- norm=None,
- mel_scale="kaldi",
- triangularize_in_mel_space=True,
- )
-
- self.mel_filters = mel_filters
- self.window = window_function(400, "hann", periodic=False)
-
- def _extract_fbank_features(
- self,
- waveform: np.ndarray,
- max_length: int,
- ) -> np.ndarray:
- """
- Get mel-filter bank features using TorchAudio if available, otherwise numpy. Note that TorchAudio requires
- 16-bit signed integers as inputs, hence the waveform should not be normalized before feature extraction.
- """
- # waveform = waveform * (2**15) # Kaldi compliance: 16-bit signed integers
- if is_speech_available():
- waveform = torch.from_numpy(waveform).unsqueeze(0)
- fbank = ta_kaldi.fbank(
- waveform,
- sample_frequency=self.sampling_rate,
- window_type="hanning",
- num_mel_bins=self.num_mel_bins,
- )
- else:
- waveform = np.squeeze(waveform)
- fbank = spectrogram(
- waveform,
- self.window,
- frame_length=400,
- hop_length=160,
- fft_length=512,
- power=2.0,
- center=False,
- preemphasis=0.97,
- mel_filters=self.mel_filters,
- log_mel="log",
- mel_floor=1.192092955078125e-07,
- remove_dc_offset=True,
- ).T
-
- fbank = torch.from_numpy(fbank)
-
- n_frames = fbank.shape[0]
- difference = max_length - n_frames
-
- # pad or truncate, depending on difference
- if difference > 0:
- pad_module = torch.nn.ZeroPad2d((0, 0, 0, difference))
- fbank = pad_module(fbank)
- elif difference < 0:
- fbank = fbank[0:max_length, :]
-
- fbank = fbank.numpy()
-
- return fbank
-
- def normalize(self, input_values: np.ndarray) -> np.ndarray:
- return (input_values - (self.mean)) / (self.std * 2)
-
- def __call__(
- self,
- raw_speech: np.ndarray | list[float] | list[np.ndarray] | list[list[float]],
- sampling_rate: int | None = None,
- return_tensors: str | TensorType | None = None,
- **kwargs,
- ) -> BatchFeature:
- """
- Main method to featurize and prepare for the model one or several sequence(s).
-
- Args:
- raw_speech (`np.ndarray`, `list[float]`, `list[np.ndarray]`, `list[list[float]]`):
- The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
- values, a list of numpy arrays or a list of list of float values. Must be mono channel audio, not
- stereo, i.e. single float per timestep.
- sampling_rate (`int`, *optional*):
- The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass
- `sampling_rate` at the forward call to prevent silent errors.
- return_tensors (`str` or [`~utils.TensorType`], *optional*):
- If set, will return tensors instead of list of python integers. Acceptable values are:
-
- - `'pt'`: Return PyTorch `torch.Tensor` objects.
- - `'np'`: Return Numpy `np.ndarray` objects.
- """
-
- if sampling_rate is not None:
- if sampling_rate != self.sampling_rate:
- raise ValueError(
- f"The model corresponding to this feature extractor: {self} was trained using a sampling rate of"
- f" {self.sampling_rate}. Please make sure that the provided `raw_speech` input was sampled with"
- f" {self.sampling_rate} and not {sampling_rate}."
- )
- else:
- logger.warning(
- f"It is strongly recommended to pass the `sampling_rate` argument to `{self.__class__.__name__}()`. "
- "Failing to do so can result in silent errors that might be hard to debug."
- )
-
- is_batched_numpy = isinstance(raw_speech, np.ndarray) and len(raw_speech.shape) > 1
- if is_batched_numpy and len(raw_speech.shape) > 2:
- raise ValueError(f"Only mono-channel audio is supported for input to {self}")
- is_batched = is_batched_numpy or (
- isinstance(raw_speech, (list, tuple)) and (isinstance(raw_speech[0], (np.ndarray, tuple, list)))
- )
-
- if is_batched:
- raw_speech = [np.asarray(speech, dtype=np.float32) for speech in raw_speech]
- elif not is_batched and not isinstance(raw_speech, np.ndarray):
- raw_speech = np.asarray(raw_speech, dtype=np.float32)
- elif isinstance(raw_speech, np.ndarray) and raw_speech.dtype is np.dtype(np.float64):
- raw_speech = raw_speech.astype(np.float32)
-
- # always return batch
- if not is_batched:
- raw_speech = [raw_speech]
-
- # extract fbank features and pad/truncate to max_length
- features = [self._extract_fbank_features(waveform, max_length=self.max_length) for waveform in raw_speech]
-
- # convert into BatchFeature
- padded_inputs = BatchFeature({"input_values": features})
-
- # make sure list is in array format
- input_values = padded_inputs.get("input_values")
- if isinstance(input_values[0], list):
- padded_inputs["input_values"] = [np.asarray(feature, dtype=np.float32) for feature in input_values]
-
- # normalization
- if self.do_normalize:
- padded_inputs["input_values"] = [self.normalize(feature) for feature in input_values]
-
- if return_tensors is not None:
- padded_inputs = padded_inputs.convert_to_tensors(return_tensors)
-
- return padded_inputs
+ASTFeatureExtractor = deprecated_feature_extractor(
+ AudioSpectrogramTransformerAudioProcessor, "ASTFeatureExtractor"
+)
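+# `deprecated_feature_extractor` is assumed to return a thin wrapper class that
+# warns on use while delegating to the new audio processor.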
__all__ = ["ASTFeatureExtractor"]
diff --git a/src/transformers/models/clap/audio_processing_clap.py b/src/transformers/models/clap/audio_processing_clap.py
new file mode 100644
index 000000000000..20525c3bfce3
--- /dev/null
+++ b/src/transformers/models/clap/audio_processing_clap.py
@@ -0,0 +1,129 @@
+# Copyright 2025 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+
+from ...audio_processing_backends import NumpyAudioBackend
+from ...audio_utils import MelScaleConfig, SpectrogramConfig, StftConfig
+from ...utils import PaddingStrategy
+
+
+class ClapAudioProcessor(NumpyAudioBackend):
+ sample_rate = 48000
+ force_mono = True
+ max_length = 480000
+ truncation_mode = "rand_trunc" # "fusion" or "rand_trunc"
+
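+ # CLAP historically pairs each truncation mode with a different filter bank:
+ # torchaudio-style HTK filters for "fusion" and librosa-style Slaney filters for
+ # "rand_trunc" (see the legacy extractor's `mel_filters`/`mel_filters_slaney`).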
+ _mel_configs = {
+ "rand_trunc": MelScaleConfig(n_mels=64, f_min=50, f_max=14000, mel_scale="slaney", norm="slaney", frequency_bin_mode="linspace"),
+ "fusion": MelScaleConfig(n_mels=64, f_min=50, f_max=14000, mel_scale="htk", frequency_bin_mode="linspace"),
+ }
+
+ def __init__(self, **kwargs):
+ truncation_mode = kwargs.pop("truncation_mode", self.truncation_mode)
+ self.truncation_mode = truncation_mode
+ self.spectrogram_config = SpectrogramConfig(
+ stft_config=StftConfig(n_fft=1024, hop_length=480, power=2.0),
+ mel_scale_config=self._mel_configs[truncation_mode],
+ log_mode="dB",
+ computation_dtype="float64",
+ )
+ super().__init__(**kwargs)
+ # rand_trunc: base class truncates via pad() → _truncate_single (random offset)
+ # fusion: no pre-truncation; full mel is extracted then chunked
+ self.truncation = truncation_mode == "rand_trunc"
+
+ def _get_padding_strategies(self, padding=False, max_length=None):
+ # CLAP always pads to max_length, not to the longest in the batch
+ if padding is True and max_length is not None:
+ return PaddingStrategy.MAX_LENGTH
+ return super()._get_padding_strategies(padding=padding, max_length=max_length)
+
+ def pad(self, audio, *args, **kwargs):
+ self._is_longer_flags = []
+ return super().pad(audio, *args, **kwargs)
+
+ def _truncate_single(self, audio_el, max_length):
+ """Random-offset truncation for rand_trunc mode, also tracks which samples were longer."""
+ self._is_longer_flags.append(audio_el.shape[-1] > max_length)
+ if audio_el.shape[-1] > max_length:
+ idx = np.random.randint(0, audio_el.shape[-1] - max_length + 1)
+ return audio_el[..., idx : idx + max_length]
+ return audio_el
+
+ def extract_spectrogram(self, audio, *, spectrogram_config=None, audio_ranges=None, **kwargs):
+ """Extract mel spectrogram and shape output (1 view for rand_trunc, 4 for fusion)."""
+ is_fusion = self.truncation_mode == "fusion"
+ chunk_frames = self.max_length // self.spectrogram_config.stft_config.hop_length + 1
+
+ if isinstance(audio, np.ndarray) and audio.ndim == 2:
+ waveforms = list(audio)
+ elif isinstance(audio, np.ndarray) and audio.ndim == 1:
+ waveforms = [audio]
+ else:
+ waveforms = audio
+
+ mels = []
+ is_longer = []
+ for waveform in waveforms:
+ mel = super().extract_spectrogram(waveform, spectrogram_config=self.spectrogram_config).T # (time, n_mels)
+ total_frames = mel.shape[0]
+
+ if is_fusion and total_frames > chunk_frames:
+ mels.append(self._random_mel_fusion(mel, total_frames, chunk_frames))
+ is_longer.append(True)
+ elif is_fusion:
+ mels.append(np.stack([mel, mel, mel, mel], axis=0))
+ is_longer.append(False)
+ else:
+ mels.append(mel[np.newaxis])
+ is_longer.append(False)
+
+ if is_fusion:
+ self._is_longer_flags = is_longer
+ return mels
+
+ def _random_mel_fusion(self, mel, total_frames, chunk_frames):
+ import torch
+
+ ranges = np.array_split(list(range(0, total_frames - chunk_frames + 1)), 3)
+ if len(ranges[1]) == 0:
+ ranges[1] = [0]
+ if len(ranges[2]) == 0:
+ ranges[2] = [0]
+ idx_front = np.random.choice(ranges[0])
+ idx_middle = np.random.choice(ranges[1])
+ idx_back = np.random.choice(ranges[2])
+
+ mel_chunk_front = mel[idx_front : idx_front + chunk_frames, :]
+ mel_chunk_middle = mel[idx_middle : idx_middle + chunk_frames, :]
+ mel_chunk_back = mel[idx_back : idx_back + chunk_frames, :]
+
+ mel_tensor = torch.tensor(mel[None, None, :])
+ mel_shrink = torch.nn.functional.interpolate(
+ mel_tensor, size=[chunk_frames, 64], mode="bilinear", align_corners=False
+ )
+ mel_shrink = mel_shrink[0][0].numpy()
+ return np.stack([mel_shrink, mel_chunk_front, mel_chunk_middle, mel_chunk_back], axis=0)
+
+ def _get_mask(self, audio_ranges, padded_length, do_extract_spectrogram, spectrogram_config):
+ """Return CLAP's is_longer flag instead of a standard attention mask."""
+ is_longer = getattr(self, "_is_longer_flags", None) or [False] * len(audio_ranges)
+ if self.truncation_mode == "fusion" and sum(is_longer) == 0:
+ rand_idx = np.random.randint(0, len(is_longer))
+ is_longer[rand_idx] = True
+ return {"is_longer": [[longer] for longer in is_longer]}
+
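+ # Output layout (per `extract_spectrogram` above): "rand_trunc" yields one view
+ # per sample of shape (1, frames, 64); "fusion" yields four stacked views of
+ # shape (4, chunk_frames, 64): 3 random crops plus a bilinearly downsampled full
+ # mel when the audio exceeds `max_length`, otherwise 4 copies of the padded mel.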
+
+__all__ = ["ClapAudioProcessor"]
diff --git a/src/transformers/models/clap/feature_extraction_clap.py b/src/transformers/models/clap/feature_extraction_clap.py
index 8f0a34d2cf4e..79c3c9353825 100644
--- a/src/transformers/models/clap/feature_extraction_clap.py
+++ b/src/transformers/models/clap/feature_extraction_clap.py
@@ -11,354 +11,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-"""Feature extractor class for CLAP."""
+from ...utils.deprecation import deprecated_feature_extractor
+from .audio_processing_clap import ClapAudioProcessor
-import copy
-from typing import Any
-
-import numpy as np
-import torch
-
-from ...audio_utils import mel_filter_bank, spectrogram, window_function
-from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
-from ...feature_extraction_utils import BatchFeature
-from ...utils import TensorType, logging
-from ...utils.import_utils import requires
-
-
-logger = logging.get_logger(__name__)
-
-
-@requires(backends=("torch",))
-class ClapFeatureExtractor(SequenceFeatureExtractor):
- r"""
- Constructs a CLAP feature extractor.
-
- This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
- most of the main methods. Users should refer to this superclass for more information regarding those methods.
-
- This class extracts mel-filter bank features from raw speech using a custom numpy implementation of the *Short Time
- Fourier Transform* (STFT) which should match pytorch's `torch.stft` equivalent.
-
- Args:
- feature_size (`int`, *optional*, defaults to 64):
- The feature dimension of the extracted Mel spectrograms. This corresponds to the number of mel filters
- (`n_mels`).
- sampling_rate (`int`, *optional*, defaults to 48000):
- The sampling rate at which the audio files should be digitalized expressed in hertz (Hz). This only serves
- to warn users if the audio fed to the feature extractor does not have the same sampling rate.
- hop_length (`int`,*optional*, defaults to 480):
- Length of the overlapping windows for the STFT used to obtain the Mel Spectrogram. The audio will be split
- in smaller `frames` with a step of `hop_length` between each frame.
- max_length_s (`int`, *optional*, defaults to 10):
- The maximum input length of the model in seconds. This is used to pad the audio.
- fft_window_size (`int`, *optional*, defaults to 1024):
- Size of the window (in samples) on which the Fourier transform is applied. This controls the frequency
- resolution of the spectrogram. 1024 means that the Fourier transform is computed on windows of 1024 samples.
- padding_value (`float`, *optional*, defaults to 0.0):
- Padding value used to pad the audio. Should correspond to silences.
- return_attention_mask (`bool`, *optional*, defaults to `False`):
- Whether or not the model should return the attention masks corresponding to the input.
- frequency_min (`float`, *optional*, defaults to 0):
- The lowest frequency of interest. The STFT will not be computed for values below this.
- frequency_max (`float`, *optional*, defaults to 14000):
- The highest frequency of interest. The STFT will not be computed for values above this.
- top_db (`float`, *optional*):
- The highest decibel value used to convert the mel spectrogram to the log scale. For more details see the
- `audio_utils.power_to_db` function
- truncation (`str`, *optional*, defaults to `"fusion"`):
- Truncation pattern for long audio inputs. Two patterns are available:
- - `fusion` will use `_random_mel_fusion`, which stacks 3 random crops from the mel spectrogram and a
- downsampled version of the entire mel spectrogram.
- If `config.fusion` is set to True, shorter audios also need to return 4 mels, which will just be a copy
- of the original mel obtained from the padded audio.
- - `rand_trunc` will select a random crop of the mel spectrogram.
- padding (`str`, *optional*, defaults to `"repeatpad"`):
- Padding pattern for shorter audio inputs. Three patterns were originally implemented:
- - `repeatpad`: the audio is repeated, and then padded to fit the `max_length`.
- - `repeat`: the audio is repeated and then cut to fit the `max_length`
- - `pad`: the audio is padded.
- """
-
- model_input_names = ["input_features", "is_longer"]
-
- def __init__(
- self,
- feature_size=64,
- sampling_rate=48_000,
- hop_length=480,
- max_length_s=10,
- fft_window_size=1024,
- padding_value=0.0,
- return_attention_mask=False, # pad inputs to max length with silence token (zero) and no attention mask
- frequency_min: float = 0,
- frequency_max: float = 14_000,
- top_db: int | None = None,
- truncation: str = "fusion",
- padding: str = "repeatpad",
- **kwargs,
- ):
- super().__init__(
- feature_size=feature_size,
- sampling_rate=sampling_rate,
- padding_value=padding_value,
- return_attention_mask=return_attention_mask,
- **kwargs,
- )
- self.top_db = top_db
- self.truncation = truncation
- self.padding = padding
- self.fft_window_size = fft_window_size
- self.nb_frequency_bins = (fft_window_size >> 1) + 1
- self.hop_length = hop_length
- self.max_length_s = max_length_s
- self.nb_max_samples = max_length_s * sampling_rate
- self.sampling_rate = sampling_rate
- self.frequency_min = frequency_min
- self.frequency_max = frequency_max
- self.mel_filters = mel_filter_bank(
- num_frequency_bins=self.nb_frequency_bins,
- num_mel_filters=feature_size,
- min_frequency=frequency_min,
- max_frequency=frequency_max,
- sampling_rate=sampling_rate,
- norm=None,
- mel_scale="htk",
- )
- self.mel_filters_slaney = mel_filter_bank(
- num_frequency_bins=self.nb_frequency_bins,
- num_mel_filters=feature_size,
- min_frequency=frequency_min,
- max_frequency=frequency_max,
- sampling_rate=sampling_rate,
- norm="slaney",
- mel_scale="slaney",
- )
-
- def to_dict(self) -> dict[str, Any]:
- """
- Serializes this instance to a Python dictionary.
-
- Returns:
- `dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance, except for the
- mel filter banks, which do not need to be saved or printed as they are too long.
- """
- output = copy.deepcopy(self.__dict__)
- output["feature_extractor_type"] = self.__class__.__name__
- if "mel_filters" in output:
- del output["mel_filters"]
- if "mel_filters_slaney" in output:
- del output["mel_filters_slaney"]
- return output
-
- def _np_extract_fbank_features(self, waveform: np.ndarray, mel_filters: np.ndarray | None = None) -> np.ndarray:
- """
- Compute the log-mel spectrogram of the provided `waveform` using the Hann window. In CLAP, two different filter
- banks are used depending on the truncation pattern:
- - `self.mel_filters`: they correspond to the default parameters of `torchaudio` which can be obtained from
- calling `torchaudio.transforms.MelSpectrogram().mel_scale.fb`. These filters are used when `truncation`
- is set to `"fusion"`.
- - `self.mel_filters_slaney`: they correspond to the default parameters of `librosa`, which uses
- `librosa.filters.mel` when computing the mel spectrogram. These filters were only used in the original
- implementation when the truncation mode is not `"fusion"`.
- """
- log_mel_spectrogram = spectrogram(
- waveform,
- window_function(self.fft_window_size, "hann"),
- frame_length=self.fft_window_size,
- hop_length=self.hop_length,
- power=2.0,
- mel_filters=mel_filters,
- log_mel="dB",
- )
- return log_mel_spectrogram.T
-
- def _random_mel_fusion(self, mel, total_frames, chunk_frames):
- ranges = np.array_split(list(range(0, total_frames - chunk_frames + 1)), 3)
- if len(ranges[1]) == 0:
- # if the audio is too short, we just use the first chunk
- ranges[1] = [0]
- if len(ranges[2]) == 0:
- # if the audio is too short, we just use the first chunk
- ranges[2] = [0]
- # randomly choose index for each part
- idx_front = np.random.choice(ranges[0])
- idx_middle = np.random.choice(ranges[1])
- idx_back = np.random.choice(ranges[2])
-
- mel_chunk_front = mel[idx_front : idx_front + chunk_frames, :]
- mel_chunk_middle = mel[idx_middle : idx_middle + chunk_frames, :]
- mel_chunk_back = mel[idx_back : idx_back + chunk_frames, :]
-
- mel = torch.tensor(mel[None, None, :])
- mel_shrink = torch.nn.functional.interpolate(
- mel, size=[chunk_frames, 64], mode="bilinear", align_corners=False
- )
- mel_shrink = mel_shrink[0][0].numpy()
- mel_fusion = np.stack([mel_shrink, mel_chunk_front, mel_chunk_middle, mel_chunk_back], axis=0)
- return mel_fusion
-
- def _get_input_mel(self, waveform: np.ndarray, max_length, truncation, padding) -> np.ndarray:
- """
- Extracts the mel spectrogram and prepares it for the mode based on the `truncation` and `padding` arguments.
- Four different paths are possible:
- - `truncation="fusion"` and the length of the waveform is greater than the max length: the mel spectrogram
- will be computed on the entire audio. 3 random crops and a downsampled version of the full mel spectrogram
- are then stacked together. They will later be used for `feature_fusion`.
- - `truncation="rand_trunc"` and the length of the waveform is smaller than the max length: the audio is
- padded based on `padding`.
- - `truncation="fusion"` and the length of the waveform is smaller than the max length: the audio is padded
- based on `padding`, and is repeated `4` times.
- - `truncation="rand_trunc"` and the length of the waveform is greater than the max length: the mel
- spectrogram will be computed on a random crop of the waveform.
-
- """
- if waveform.shape[0] > max_length:
- if truncation == "rand_trunc":
- longer = True
- # random crop to max_length (for compatibility) -> this should be handled by self.pad
- overflow = len(waveform) - max_length
- idx = np.random.randint(0, overflow + 1)
- waveform = waveform[idx : idx + max_length]
- input_mel = self._np_extract_fbank_features(waveform, self.mel_filters_slaney)[None, :]
- elif truncation == "fusion":
- mel = self._np_extract_fbank_features(waveform, self.mel_filters)
- chunk_frames = max_length // self.hop_length + 1 # the +1 relates to how the spectrogram is computed
- total_frames = mel.shape[0]
- if chunk_frames == total_frames:
- # there is a corner case where the audio length is larger than max_length but smaller than max_length+hop_length.
- # In this case, we just use the whole audio.
- input_mel = np.stack([mel, mel, mel, mel], axis=0)
- longer = False
- else:
- input_mel = self._random_mel_fusion(mel, total_frames, chunk_frames)
- longer = True
- else:
- raise NotImplementedError(f"data_truncating {truncation} not implemented")
-
- else:
- longer = False
- # `repeat` is the only new possible value for `padding`: the audio is repeated before the usual max_length padding is applied
- if waveform.shape[0] < max_length:
- if padding == "repeat":
- n_repeat = int(max_length / len(waveform))
- waveform = np.tile(waveform, n_repeat + 1)[:max_length]
- if padding == "repeatpad":
- n_repeat = int(max_length / len(waveform))
- waveform = np.tile(waveform, n_repeat)
- waveform = np.pad(waveform, (0, max_length - waveform.shape[0]), mode="constant", constant_values=0)
-
- if truncation == "fusion":
- input_mel = self._np_extract_fbank_features(waveform, self.mel_filters)
- input_mel = np.stack([input_mel, input_mel, input_mel, input_mel], axis=0)
- else:
- input_mel = self._np_extract_fbank_features(waveform, self.mel_filters_slaney)[None, :]
-
- return input_mel, longer
-
- def __call__(
- self,
- raw_speech: np.ndarray | list[float] | list[np.ndarray] | list[list[float]],
- truncation: str | None = None,
- padding: str | None = None,
- max_length: int | None = None,
- sampling_rate: int | None = None,
- return_tensors: str | TensorType | None = None,
- **kwargs,
- ) -> BatchFeature:
- """
- Main method to featurize and prepare for the model one or several sequence(s).
-
- Args:
- raw_speech (`np.ndarray`, `list[float]`, `list[np.ndarray]`, `list[list[float]]`):
- The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
- values, a list of numpy arrays or a list of list of float values. Must be mono channel audio, not
- stereo, i.e. single float per timestep.
- truncation (`str`, *optional*):
- Truncation pattern for long audio inputs. Two patterns are available:
- - `fusion` will use `_random_mel_fusion`, which stacks 3 random crops from the mel spectrogram and
- a downsampled version of the entire mel spectrogram.
- If `config.fusion` is set to True, shorter audios also need to return 4 mels, which will just be a
- copy of the original mel obtained from the padded audio.
- - `rand_trunc` will select a random crop of the mel spectrogram.
- padding (`str`, *optional*):
- Padding pattern for shorter audio inputs. Three patterns were originally implemented:
- - `repeatpad`: the audio is repeated, and then padded to fit the `max_length`.
- - `repeat`: the audio is repeated and then cut to fit the `max_length`
- - `pad`: the audio is padded.
- return_tensors (`str` or [`~utils.TensorType`], *optional*):
- If set, will return tensors instead of list of python integers. Acceptable values are:
- - `'pt'`: Return PyTorch `torch.Tensor` objects.
- - `'np'`: Return Numpy `np.ndarray` objects.
- sampling_rate (`int`, *optional*):
- The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass
- `sampling_rate` at the forward call to prevent silent errors and to support the automatic speech
- recognition pipeline.
- """
- truncation = truncation if truncation is not None else self.truncation
- padding = padding if padding else self.padding
-
- if sampling_rate is not None:
- if sampling_rate != self.sampling_rate:
- raise ValueError(
- f"The model corresponding to this feature extractor: {self.__class__.__name__} was trained using a"
- f" sampling rate of {self.sampling_rate}. Please make sure that the provided `raw_speech` input"
- f" was sampled with {self.sampling_rate} and not {sampling_rate}."
- )
- else:
- logger.warning(
- f"It is strongly recommended to pass the `sampling_rate` argument to `{self.__class__.__name__}()`. "
- "Failing to do so can result in silent errors that might be hard to debug."
- )
-
- is_batched_numpy = isinstance(raw_speech, np.ndarray) and len(raw_speech.shape) > 1
- if is_batched_numpy and len(raw_speech.shape) > 2:
- raise ValueError(f"Only mono-channel audio is supported for input to {self}")
- is_batched = is_batched_numpy or (
- isinstance(raw_speech, (list, tuple)) and (isinstance(raw_speech[0], (np.ndarray, tuple, list)))
- )
-
- if is_batched:
- raw_speech = [np.asarray(speech, dtype=np.float64) for speech in raw_speech]
- elif not is_batched and not isinstance(raw_speech, np.ndarray):
- raw_speech = np.asarray(raw_speech, dtype=np.float64)
- elif isinstance(raw_speech, np.ndarray) and raw_speech.dtype is np.dtype(np.float64):
- raw_speech = raw_speech.astype(np.float64)
-
- # always return batch
- if not is_batched:
- raw_speech = [np.asarray(raw_speech)]
-
- # convert to mel spectrogram, truncate and pad if needed.
- padded_inputs = [
- self._get_input_mel(waveform, max_length if max_length else self.nb_max_samples, truncation, padding)
- for waveform in raw_speech
- ]
-
- input_mel = []
- is_longer = []
- for mel, longer in padded_inputs:
- input_mel.append(mel)
- is_longer.append(longer)
-
- if truncation == "fusion" and sum(is_longer) == 0:
- # if no audio is longer than 10s, then randomly select one audio to be longer
- rand_idx = np.random.randint(0, len(input_mel))
- is_longer[rand_idx] = True
-
- if isinstance(input_mel[0], list):
- input_mel = [np.asarray(feature, dtype=np.float64) for feature in input_mel]
-
- # is_longer is a list of bool
- is_longer = [[longer] for longer in is_longer]
-
- input_features = {"input_features": input_mel, "is_longer": is_longer}
- input_features = BatchFeature(input_features)
-
- if return_tensors is not None:
- input_features = input_features.convert_to_tensors(return_tensors)
-
- return input_features
+ClapFeatureExtractor = deprecated_feature_extractor(ClapAudioProcessor, "ClapFeatureExtractor")
__all__ = ["ClapFeatureExtractor"]
diff --git a/src/transformers/models/clvp/audio_processing_clvp.py b/src/transformers/models/clvp/audio_processing_clvp.py
new file mode 100644
index 000000000000..6272c795d1fd
--- /dev/null
+++ b/src/transformers/models/clvp/audio_processing_clvp.py
@@ -0,0 +1,61 @@
+# Copyright 2025 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+
+from ...audio_processing_backends import NumpyAudioBackend
+from ...audio_utils import MelScaleConfig, SpectrogramConfig, StftConfig
+
+
+class ClvpAudioProcessor(NumpyAudioBackend):
+ sample_rate = 22050
+ force_mono = True
+ max_length = 132300 # 6 seconds at 22050 Hz
+ truncation = True
+ mask_level = "audio"
+
+ spectrogram_config = SpectrogramConfig(
+ stft_config=StftConfig(
+ n_fft=1024,
+ hop_length=256,
+ window_fn="hann_window",
+ power=2.0,
+ ),
+ mel_scale_config=MelScaleConfig(
+ n_mels=80,
+ f_min=0.0,
+ f_max=8000.0,
+ norm="slaney",
+ mel_scale="htk",
+ frequency_bin_mode="linspace",
+ ),
+ log_mode="log",
+ mel_floor=1e-5,
+ computation_dtype="float64",
+ )
+
+ def __init__(self, mel_norms=None, **kwargs):
+ super().__init__(**kwargs)
+ self.mel_norms = mel_norms
+
+ def _normalize_magnitude(self, features, *, spectrogram_config, **kwargs):
+ # Compute log and mel_norms division in float64 before casting to float32
+ # to match the legacy feature extractor's precision
+ mel_floor = spectrogram_config.mel_floor
+ features = np.log(np.maximum(mel_floor, features))
+ if self.mel_norms is not None:
+ features = features / np.array(self.mel_norms)[:, None]
+ return features.astype(np.float32)
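+
+ # Minimal usage sketch (assuming the standard call convention and that the
+ # backend keeps the legacy `input_features` key; `wav` is a 1-D numpy array
+ # sampled at 22050 Hz):
+ # processor = ClvpAudioProcessor()
+ # feats = processor(wav, sampling_rate=22050, return_tensors="np")
+ # feats["input_features"].shape # expected (1, 80, frames)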
+
+
+__all__ = ["ClvpAudioProcessor"]
diff --git a/src/transformers/models/clvp/feature_extraction_clvp.py b/src/transformers/models/clvp/feature_extraction_clvp.py
index cc39e6aca677..e5966a9b2f02 100644
--- a/src/transformers/models/clvp/feature_extraction_clvp.py
+++ b/src/transformers/models/clvp/feature_extraction_clvp.py
@@ -11,227 +11,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+from ...utils.deprecation import deprecated_feature_extractor
+from .audio_processing_clvp import ClvpAudioProcessor
-"""
-Feature extractor class for CLVP
-"""
-
-import numpy as np
-
-from ...audio_utils import mel_filter_bank, spectrogram, window_function
-from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
-from ...feature_extraction_utils import BatchFeature
-from ...utils import TensorType, logging
-
-
-logger = logging.get_logger(__name__)
-
-
-class ClvpFeatureExtractor(SequenceFeatureExtractor):
- r"""
- Constructs a CLVP feature extractor.
-
- This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
- most of the main methods. Users should refer to this superclass for more information regarding those methods.
-
- This class extracts log-mel-spectrogram features from raw speech using a custom numpy implementation of the `Short
- Time Fourier Transform` which should match pytorch's `torch.stft` equivalent.
-
- Args:
- feature_size (`int`, *optional*, defaults to 80):
- The feature dimension of the extracted features.
- sampling_rate (`int`, *optional*, defaults to 22050):
- The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
- default_audio_length (`int`, *optional*, defaults to 6):
- The default length of raw audio in seconds. If `max_length` is not set during `__call__` then it will
- automatically be set to default_audio_length * `self.sampling_rate`.
- hop_length (`int`, *optional*, defaults to 256):
- Length of the overlapping windows for the STFT used to obtain the Mel Frequency coefficients.
- chunk_length (`int`, *optional*, defaults to 30):
- The maximum number of chunks of `sampling_rate` samples used to trim and pad longer or shorter audio
- sequences.
- n_fft (`int`, *optional*, defaults to 1024):
- Size of the Fourier transform.
- padding_value (`float`, *optional*, defaults to 0.0):
- Padding value used to pad the audio. Should correspond to silences.
- mel_norms (`list` of length `feature_size`, *optional*):
- If `mel_norms` is provided then it will be used to normalize the log-mel spectrograms along each
- mel-filter.
- return_attention_mask (`bool`, *optional*, defaults to `False`):
- Whether to return the attention mask. If left to the default, it will return the attention mask.
-
- [What are attention masks?](../glossary#attention-mask)
- """
-
- model_input_names = ["input_features", "attention_mask"]
-
- def __init__(
- self,
- feature_size=80,
- sampling_rate=22050,
- default_audio_length=6,
- hop_length=256,
- chunk_length=30,
- n_fft=1024,
- padding_value=0.0,
- mel_norms=None,
- return_attention_mask=False, # pad inputs to max length with silence token (zero) and no attention mask
- **kwargs,
- ):
- super().__init__(
- feature_size=feature_size,
- sampling_rate=sampling_rate,
- padding_value=padding_value,
- return_attention_mask=return_attention_mask,
- **kwargs,
- )
- self.n_fft = n_fft
- self.hop_length = hop_length
- self.chunk_length = chunk_length
- self.n_samples = chunk_length * sampling_rate
- self.nb_max_frames = self.n_samples // hop_length
- self.sampling_rate = sampling_rate
- self.default_audio_length = default_audio_length
- self.mel_norms = mel_norms
- self.mel_filters = mel_filter_bank(
- num_frequency_bins=1 + (n_fft // 2),
- num_mel_filters=feature_size,
- min_frequency=0.0,
- max_frequency=8000.0,
- sampling_rate=sampling_rate,
- norm="slaney",
- mel_scale="htk",
- )
-
- def _np_extract_fbank_features(self, waveform: np.ndarray) -> np.ndarray:
- """
- This method first computes the log-mel spectrogram of the provided audio, then applies normalization along
- each mel filter bank if `mel_norms` is provided.
- """
- log_spec = spectrogram(
- waveform,
- window_function(self.n_fft, "hann"),
- frame_length=self.n_fft,
- hop_length=self.hop_length,
- power=2.0,
- mel_filters=self.mel_filters,
- log_mel=None,
- )
-
- log_spec = np.log(np.clip(log_spec, a_min=1e-5, a_max=None))
-
- if self.mel_norms is not None:
- log_spec = log_spec / np.array(self.mel_norms)[:, None]
-
- return log_spec
-
- def __call__(
- self,
- raw_speech: np.ndarray | list[float] | list[np.ndarray] | list[list[float]],
- sampling_rate: int | None = None,
- truncation: bool = True,
- pad_to_multiple_of: int | None = None,
- return_tensors: str | TensorType | None = None,
- return_attention_mask: bool | None = True,
- padding: str | None = "max_length",
- max_length: int | None = None,
- **kwargs,
- ) -> BatchFeature:
- """
- `ClvpFeatureExtractor` is used to extract various voice-specific properties, such as the pitch and tone of
- the voice, speaking speed, and even speaking defects like a lisp or stuttering, from a voice sample (`raw_speech`).
-
- First the voice is padded or truncated in a way such that it becomes a waveform of `self.default_audio_length`
- seconds long and then the log-mel spectrogram is extracted from it.
-
- Args:
- raw_speech (`np.ndarray`, `list[float]`, `list[np.ndarray]`, `list[list[float]]`):
- The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
- values, a list of numpy arrays or a list of list of float values. Must be mono channel audio, not
- stereo, i.e. single float per timestep.
- sampling_rate (`int`, *optional*):
- The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass
- `sampling_rate` at the forward call to prevent silent errors and to support the automatic speech
- recognition pipeline.
- truncation (`bool`, *optional*, defaults to `True`):
- Activates truncation to cut input sequences longer than *max_length* to *max_length*.
- pad_to_multiple_of (`int`, *optional*):
- If set will pad the sequence to a multiple of the provided value.
-
- This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
- `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
- return_attention_mask (`bool`, *optional*, defaults to `True`):
- Whether to return the attention mask. If left to the default, it will return the attention mask.
-
- [What are attention masks?](../glossary#attention-mask)
- return_tensors (`str` or [`~utils.TensorType`], *optional*):
- If set, will return tensors instead of list of python integers. Acceptable values are:
-
- - `'pt'`: Return PyTorch `torch.Tensor` objects.
- - `'np'`: Return Numpy `np.ndarray` objects.
- padding_value (`float`, *optional*, defaults to 0.0):
- The value that is used to fill the padding values / vectors.
- max_length (`int`, *optional*):
- The maximum input length of the inputs.
- """
-
- if sampling_rate is not None:
- if sampling_rate != self.sampling_rate:
- raise ValueError(
- f"The model corresponding to this feature extractor: {self.__class__.__name__} was trained using a"
- f" sampling rate of {self.sampling_rate}. Please make sure that the provided `raw_speech` input"
- f" was sampled with {self.sampling_rate} and not {sampling_rate}."
- )
- else:
- logger.warning(
- f"It is strongly recommended to pass the `sampling_rate` argument to `{self.__class__.__name__}()`. "
- "Failing to do so can result in silent errors that might be hard to debug."
- )
-
- is_batched_numpy = isinstance(raw_speech, np.ndarray) and len(raw_speech.shape) > 1
- if is_batched_numpy and len(raw_speech.shape) > 2:
- raise ValueError(f"Only mono-channel audio is supported for input to {self}")
- is_batched = is_batched_numpy or (
- isinstance(raw_speech, (list, tuple)) and (isinstance(raw_speech[0], (np.ndarray, tuple, list)))
- )
-
- if is_batched:
- raw_speech = [np.asarray([speech], dtype=np.float32).T for speech in raw_speech]
- elif not is_batched and not isinstance(raw_speech, np.ndarray):
- raw_speech = np.asarray(raw_speech, dtype=np.float32)
- elif isinstance(raw_speech, np.ndarray) and raw_speech.dtype is np.dtype(np.float64):
- raw_speech = raw_speech.astype(np.float32)
-
- # always return batch
- if not is_batched:
- raw_speech = [np.asarray([raw_speech]).T]
-
- batched_speech = BatchFeature({"input_features": raw_speech})
-
- max_length = self.default_audio_length * self.sampling_rate if max_length is None else max_length
-
- padded_inputs = self.pad(
- batched_speech,
- padding=padding,
- max_length=max_length,
- truncation=truncation,
- pad_to_multiple_of=pad_to_multiple_of,
- return_attention_mask=return_attention_mask,
- )
-
- # make sure list is in array format
- input_features = padded_inputs.get("input_features").transpose(2, 0, 1)
-
- input_features = [
- self._np_extract_fbank_features(waveform).astype(np.float32) for waveform in input_features[0]
- ]
-
- if isinstance(input_features[0], list):
- padded_inputs["input_features"] = [np.asarray(feature) for feature in input_features]
- else:
- padded_inputs["input_features"] = input_features
-
- return padded_inputs.convert_to_tensors(return_tensors)
+ClvpFeatureExtractor = deprecated_feature_extractor(ClvpAudioProcessor, "ClvpFeatureExtractor")
__all__ = ["ClvpFeatureExtractor"]
diff --git a/src/transformers/models/dac/audio_processing_dac.py b/src/transformers/models/dac/audio_processing_dac.py
new file mode 100644
index 000000000000..f0a27bd57555
--- /dev/null
+++ b/src/transformers/models/dac/audio_processing_dac.py
@@ -0,0 +1,24 @@
+# Copyright 2025 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ...audio_processing_backends import NumpyAudioBackend
+
+
+class DacAudioProcessor(NumpyAudioBackend):
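+    r"""
+    Audio processor for DAC: mono 16 kHz waveforms, with an explicit channel dimension added to the output.
+    """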
+ sample_rate = 16000
+ force_mono = True
+ add_channel_dim = True
+
+
+__all__ = ["DacAudioProcessor"]
diff --git a/src/transformers/models/dac/feature_extraction_dac.py b/src/transformers/models/dac/feature_extraction_dac.py
index 7f910f57f09f..f255d22ebba5 100644
--- a/src/transformers/models/dac/feature_extraction_dac.py
+++ b/src/transformers/models/dac/feature_extraction_dac.py
@@ -11,160 +11,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-"""Feature extractor class for DAC"""
+from ...utils.deprecation import deprecated_feature_extractor
+from .audio_processing_dac import DacAudioProcessor
-import numpy as np
-
-from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
-from ...feature_extraction_utils import BatchFeature
-from ...utils import PaddingStrategy, TensorType, logging
-
-
-logger = logging.get_logger(__name__)
-
-
-class DacFeatureExtractor(SequenceFeatureExtractor):
- r"""
- Constructs an Dac feature extractor.
-
- This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
- most of the main methods. Users should refer to this superclass for more information regarding those methods.
-
- Args:
- feature_size (`int`, *optional*, defaults to 1):
- The feature dimension of the extracted features. Use 1 for mono, 2 for stereo.
- sampling_rate (`int`, *optional*, defaults to 16000):
- The sampling rate at which the audio waveform should be digitalized, expressed in hertz (Hz).
- padding_value (`float`, *optional*, defaults to 0.0):
- The value that is used for padding.
- hop_length (`int`, *optional*, defaults to 512):
- Overlap length between successive windows.
- """
-
- model_input_names = ["input_values", "n_quantizers"]
-
- def __init__(
- self,
- feature_size: int = 1,
- sampling_rate: int = 16000,
- padding_value: float = 0.0,
- hop_length: int = 512,
- **kwargs,
- ):
- super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs)
- self.hop_length = hop_length
-
- def __call__(
- self,
- raw_audio: np.ndarray | list[float] | list[np.ndarray] | list[list[float]],
- padding: bool | str | PaddingStrategy | None = None,
- truncation: bool | None = False,
- max_length: int | None = None,
- return_tensors: str | TensorType | None = None,
- sampling_rate: int | None = None,
- ) -> BatchFeature:
- """
- Main method to featurize and prepare for the model one or several sequence(s).
-
- Args:
- raw_audio (`np.ndarray`, `list[float]`, `list[np.ndarray]`, `list[list[float]]`):
- The sequence or batch of sequences to be processed. Each sequence can be a numpy array, a list of float
- values, a list of numpy arrays or a list of list of float values. The numpy array must be of shape
- `(num_samples,)` for mono audio (`feature_size = 1`), or `(2, num_samples)` for stereo audio
- (`feature_size = 2`).
- padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
- Select a strategy to pad the returned sequences (according to the model's padding side and padding
- index) among:
-
- - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
- sequence if provided).
- - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
- acceptable input length for the model if that argument is not provided.
- - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
- lengths).
- truncation (`bool`, *optional*, defaults to `False`):
- Activates truncation to cut input sequences longer than `max_length` to `max_length`.
- max_length (`int`, *optional*):
- Maximum length of the returned list and optionally padding length (see above).
- return_tensors (`str` or [`~utils.TensorType`], *optional*, default to 'pt'):
- If set, will return tensors instead of list of python integers. Acceptable values are:
-
- - `'pt'`: Return PyTorch `torch.Tensor` objects.
- - `'np'`: Return Numpy `np.ndarray` objects.
- sampling_rate (`int`, *optional*):
- The sampling rate at which the `audio` input was sampled. It is strongly recommended to pass
- `sampling_rate` at the forward call to prevent silent errors.
- """
- if sampling_rate is not None:
- if sampling_rate != self.sampling_rate:
- raise ValueError(
- f"The model corresponding to this feature extractor: {self} was trained using a sampling rate of"
- f" {self.sampling_rate}. Please make sure that the provided audio input was sampled with"
- f" {self.sampling_rate} and not {sampling_rate}."
- )
- else:
- logger.warning(
- f"It is strongly recommended to pass the `sampling_rate` argument to `{self.__class__.__name__}()`. "
- "Failing to do so can result in silent errors that might be hard to debug."
- )
-
- if padding and truncation:
- raise ValueError("Both padding and truncation were set. Make sure you only set one.")
- elif padding is None:
- # by default let's pad the inputs
- padding = True
-
- is_batched = bool(
- isinstance(raw_audio, (list, tuple)) and (isinstance(raw_audio[0], (np.ndarray, tuple, list)))
- )
-
- if is_batched:
- raw_audio = [np.asarray(audio, dtype=np.float32).T for audio in raw_audio]
- elif not is_batched and not isinstance(raw_audio, np.ndarray):
- raw_audio = np.asarray(raw_audio, dtype=np.float32)
- elif isinstance(raw_audio, np.ndarray) and raw_audio.dtype is np.dtype(np.float64):
- raw_audio = raw_audio.astype(np.float32)
-
- # always return batch
- if not is_batched:
- raw_audio = [np.asarray(raw_audio).T]
-
- # verify inputs are valid
- for idx, example in enumerate(raw_audio):
- if example.ndim > 2:
- raise ValueError(f"Expected input shape (channels, length) but got shape {example.shape}")
- if self.feature_size == 1 and example.ndim != 1:
- raise ValueError(f"Expected mono audio but example has {example.shape[-1]} channels")
- if self.feature_size == 2:
- raise ValueError("Stereo audio isn't supported for now")
-
- input_values = BatchFeature({"input_values": raw_audio})
-
- # normal padding on batch
- padded_inputs = self.pad(
- input_values,
- max_length=max_length,
- truncation=truncation,
- padding=padding,
- return_attention_mask=padding,
- pad_to_multiple_of=self.hop_length,
- )
- if padding:
- padded_inputs["padding_mask"] = padded_inputs.pop("attention_mask")
- if padding:
- padded_inputs.input_values = padded_inputs.input_values[:, np.newaxis, :]
-
- input_values = []
- for example in padded_inputs.pop("input_values"):
- if self.feature_size == 1:
- example = example[..., None]
- input_values.append(example.T)
-
- padded_inputs["input_values"] = input_values
- if return_tensors is not None:
- padded_inputs = padded_inputs.convert_to_tensors(return_tensors)
-
- return padded_inputs
+DacFeatureExtractor = deprecated_feature_extractor(DacAudioProcessor, "DacFeatureExtractor")
__all__ = ["DacFeatureExtractor"]
diff --git a/src/transformers/models/dia/audio_processing_dia.py b/src/transformers/models/dia/audio_processing_dia.py
new file mode 100644
index 000000000000..e1b7b0301e71
--- /dev/null
+++ b/src/transformers/models/dia/audio_processing_dia.py
@@ -0,0 +1,25 @@
+# Copyright 2025 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ...audio_processing_backends import NumpyAudioBackend
+
+
+class DiaAudioProcessor(NumpyAudioBackend):
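+    r"""
+    Audio processor for Dia: mono 44.1 kHz waveforms with an added channel dimension, padded to a
+    multiple of 512 samples.
+    """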
+ sample_rate = 44100
+ force_mono = True
+ add_channel_dim = True
+ pad_to_multiple_of = 512
+
+
+__all__ = ["DiaAudioProcessor"]
diff --git a/src/transformers/models/dia/feature_extraction_dia.py b/src/transformers/models/dia/feature_extraction_dia.py
index eda1ead6e014..d358589b4282 100644
--- a/src/transformers/models/dia/feature_extraction_dia.py
+++ b/src/transformers/models/dia/feature_extraction_dia.py
@@ -11,169 +11,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-"""Feature extractor class for Dia"""
+from ...utils.deprecation import deprecated_feature_extractor
+from .audio_processing_dia import DiaAudioProcessor
-import numpy as np
-
-from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
-from ...feature_extraction_utils import BatchFeature
-from ...utils import PaddingStrategy, TensorType, logging
-
-
-logger = logging.get_logger(__name__)
-
-
-class DiaFeatureExtractor(SequenceFeatureExtractor):
- r"""
- Constructs an Dia feature extractor.
-
- This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
- most of the main methods. Users should refer to this superclass for more information regarding those methods.
-
- Args:
- feature_size (`int`, *optional*, defaults to 1):
- The feature dimension of the extracted features. Use 1 for mono, 2 for stereo.
- sampling_rate (`int`, *optional*, defaults to 16000):
- The sampling rate at which the audio waveform should be digitalized, expressed in hertz (Hz).
- padding_value (`float`, *optional*, defaults to 0.0):
- The value that is used for padding.
- hop_length (`int`, *optional*, defaults to 512):
- Overlap length between successive windows.
- """
-
- model_input_names = ["input_values", "n_quantizers"]
-
- def __init__(
- self,
- feature_size: int = 1,
- sampling_rate: int = 16000,
- padding_value: float = 0.0,
- hop_length: int = 512,
- **kwargs,
- ):
- super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs)
- self.hop_length = hop_length
-
- def __call__(
- self,
- raw_audio: np.ndarray | list[float] | list[np.ndarray] | list[list[float]],
- padding: bool | str | PaddingStrategy | None = None,
- truncation: bool | None = False,
- max_length: int | None = None,
- return_tensors: str | TensorType | None = None,
- sampling_rate: int | None = None,
- ) -> BatchFeature:
- """
- Main method to featurize and prepare for the model one or several sequence(s).
-
- Args:
- raw_audio (`np.ndarray`, `list[float]`, `list[np.ndarray]`, `list[list[float]]`):
- The sequence or batch of sequences to be processed. Each sequence can be a numpy array, a list of float
- values, a list of numpy arrays or a list of list of float values. The numpy array must be of shape
- `(num_samples,)` for mono audio (`feature_size = 1`), or `(2, num_samples)` for stereo audio
- (`feature_size = 2`).
- padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
- Select a strategy to pad the returned sequences (according to the model's padding side and padding
- index) among:
-
- - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
- sequence if provided).
- - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
- acceptable input length for the model if that argument is not provided.
- - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
- lengths).
- truncation (`bool`, *optional*, defaults to `False`):
- Activates truncation to cut input sequences longer than `max_length` to `max_length`.
- max_length (`int`, *optional*):
- Maximum length of the returned list and optionally padding length (see above).
- return_tensors (`str` or [`~utils.TensorType`], *optional*, default to 'pt'):
- If set, will return tensors instead of list of python integers. Acceptable values are:
-
- - `'pt'`: Return PyTorch `torch.Tensor` objects.
- - `'np'`: Return Numpy `np.ndarray` objects.
- sampling_rate (`int`, *optional*):
- The sampling rate at which the `audio` input was sampled. It is strongly recommended to pass
- `sampling_rate` at the forward call to prevent silent errors.
- """
- if sampling_rate is not None:
- if sampling_rate != self.sampling_rate:
- raise ValueError(
- f"The model corresponding to this feature extractor: {self} was trained using a sampling rate of"
- f" {self.sampling_rate}. Please make sure that the provided audio input was sampled with"
- f" {self.sampling_rate} and not {sampling_rate}."
- )
- else:
- logger.warning(
- f"It is strongly recommended to pass the `sampling_rate` argument to `{self.__class__.__name__}()`. "
- "Failing to do so can result in silent errors that might be hard to debug."
- )
-
- if padding and truncation:
- raise ValueError("Both padding and truncation were set. Make sure you only set one.")
- elif padding is None:
- # by default let's pad the inputs
- padding = True
-
- is_batched = bool(
- isinstance(raw_audio, (list, tuple)) and (isinstance(raw_audio[0], (np.ndarray, tuple, list)))
- )
-
- if is_batched:
- raw_audio = [np.asarray(audio, dtype=np.float32).T for audio in raw_audio]
- elif not is_batched and not isinstance(raw_audio, np.ndarray):
- raw_audio = np.asarray(raw_audio, dtype=np.float32)
- elif isinstance(raw_audio, np.ndarray) and raw_audio.dtype is np.dtype(np.float64):
- raw_audio = raw_audio.astype(np.float32)
-
- # always return batch
- if not is_batched:
- raw_audio = [np.asarray(raw_audio).T]
-
- # convert stereo to mono if necessary, unique to Dia
- for idx, example in enumerate(raw_audio):
- if self.feature_size == 2 and example.ndim == 2:
- raw_audio[idx] = np.mean(example, -1)
-
- # verify inputs are valid
- for idx, example in enumerate(raw_audio):
- if example.ndim > 2:
- raise ValueError(f"Expected input shape (channels, length) but got shape {example.shape}")
- if self.feature_size == 1 and example.ndim != 1:
- raise ValueError(f"Expected mono audio but example has {example.shape[-1]} channels")
- if self.feature_size == 2 and example.ndim != 1: # note the conversion before
- raise ValueError(f"Expected stereo audio but example has {example.shape[-1]} channels")
-
- input_values = BatchFeature({"input_values": raw_audio})
-
- # temporarily treat it as if we were mono as we also convert stereo to mono
- original_feature_size = self.feature_size
- self.feature_size = 1
-
- # normal padding on batch
- padded_inputs = self.pad(
- input_values,
- max_length=max_length,
- truncation=truncation,
- padding=padding,
- return_attention_mask=True,
- pad_to_multiple_of=self.hop_length,
- )
- padded_inputs["padding_mask"] = padded_inputs.pop("attention_mask")
-
- input_values = []
- for example in padded_inputs.pop("input_values"):
- if self.feature_size == 1:
- example = example[..., None]
- input_values.append(example.T)
-
- padded_inputs["input_values"] = input_values
- if return_tensors is not None:
- padded_inputs = padded_inputs.convert_to_tensors(return_tensors)
-
- # rewrite back to original feature size
- self.feature_size = original_feature_size
-
- return padded_inputs
+DiaFeatureExtractor = deprecated_feature_extractor(DiaAudioProcessor, "DiaFeatureExtractor")
__all__ = ["DiaFeatureExtractor"]
diff --git a/src/transformers/models/encodec/audio_processing_encodec.py b/src/transformers/models/encodec/audio_processing_encodec.py
new file mode 100644
index 000000000000..022a7e145313
--- /dev/null
+++ b/src/transformers/models/encodec/audio_processing_encodec.py
@@ -0,0 +1,24 @@
+# Copyright 2025 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ...audio_processing_backends import NumpyAudioBackend
+
+
+class EncodecAudioProcessor(NumpyAudioBackend):
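+    r"""
+    Audio processor for EnCodec. Handles mono 24 kHz waveforms and adds a channel dimension.
+    """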
+ sample_rate = 24000
+ force_mono = True
+ add_channel_dim = True
+
+
+__all__ = ["EncodecAudioProcessor"]
diff --git a/src/transformers/models/encodec/feature_extraction_encodec.py b/src/transformers/models/encodec/feature_extraction_encodec.py
index 383936000243..2f1644ac912a 100644
--- a/src/transformers/models/encodec/feature_extraction_encodec.py
+++ b/src/transformers/models/encodec/feature_extraction_encodec.py
@@ -11,195 +11,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-"""Feature extractor class for EnCodec."""
+from ...utils.deprecation import deprecated_feature_extractor
+from .audio_processing_encodec import EncodecAudioProcessor
-import numpy as np
-
-from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
-from ...feature_extraction_utils import BatchFeature
-from ...utils import PaddingStrategy, TensorType, logging
-
-
-logger = logging.get_logger(__name__)
-
-
-class EncodecFeatureExtractor(SequenceFeatureExtractor):
- r"""
- Constructs an EnCodec feature extractor.
-
- This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
- most of the main methods. Users should refer to this superclass for more information regarding those methods.
-
- Instantiating a feature extractor with the defaults will yield a similar configuration to that of the
- [facebook/encodec_24khz](https://huggingface.co/facebook/encodec_24khz) architecture.
-
- Args:
- feature_size (`int`, *optional*, defaults to 1):
- The feature dimension of the extracted features. Use 1 for mono, 2 for stereo.
- sampling_rate (`int`, *optional*, defaults to 24000):
- The sampling rate at which the audio waveform should be digitalized expressed in hertz (Hz).
- padding_value (`float`, *optional*, defaults to 0.0):
- The value that is used to fill the padding values.
- chunk_length_s (`float`, *optional*):
- If defined the audio is pre-processed into chunks of lengths `chunk_length_s` and then encoded.
- overlap (`float`, *optional*):
- Defines the overlap between each chunk. It is used to compute the `chunk_stride` using the following
- formulae : `int((1.0 - self.overlap) * self.chunk_length)`.
- """
-
- model_input_names = ["input_values", "padding_mask"]
-
- def __init__(
- self,
- feature_size: int = 1,
- sampling_rate: int = 24000,
- padding_value: float = 0.0,
- chunk_length_s: float | None = None,
- overlap: float | None = None,
- **kwargs,
- ):
- super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs)
- self.chunk_length_s = chunk_length_s
- self.overlap = overlap
-
- # This is a property because you might want to change the chunk_length_s on the fly
- @property
- def chunk_length(self) -> int | None:
- if self.chunk_length_s is None:
- return None
- else:
- return int(self.chunk_length_s * self.sampling_rate)
-
- # This is a property because you might want to change the chunk_length_s on the fly
- @property
- def chunk_stride(self) -> int | None:
- if self.chunk_length_s is None or self.overlap is None:
- return None
- else:
- return max(1, int((1.0 - self.overlap) * self.chunk_length))
-
- def __call__(
- self,
- raw_audio: np.ndarray | list[float] | list[np.ndarray] | list[list[float]],
- padding: bool | str | PaddingStrategy | None = None,
- truncation: bool | None = False,
- max_length: int | None = None,
- return_tensors: str | TensorType | None = None,
- sampling_rate: int | None = None,
- ) -> BatchFeature:
- """
- Main method to featurize and prepare for the model one or several sequence(s).
-
- Args:
- raw_audio (`np.ndarray`, `list[float]`, `list[np.ndarray]`, `list[list[float]]`):
- The sequence or batch of sequences to be processed. Each sequence can be a numpy array, a list of float
- values, a list of numpy arrays or a list of list of float values. The numpy array must be of shape
- `(num_samples,)` for mono audio (`feature_size = 1`), or `(2, num_samples)` for stereo audio
- (`feature_size = 2`).
- padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
- Select a strategy to pad the returned sequences (according to the model's padding side and padding
- index) among:
-
- - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
- sequence if provided).
- - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
- acceptable input length for the model if that argument is not provided.
- - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
- lengths).
- truncation (`bool`, *optional*, defaults to `False`):
- Activates truncation to cut input sequences longer than `max_length` to `max_length`.
- max_length (`int`, *optional*):
- Maximum length of the returned list and optionally padding length (see above).
- return_tensors (`str` or [`~utils.TensorType`], *optional*):
- If set, will return tensors instead of list of python integers. Acceptable values are:
-
- - `'pt'`: Return PyTorch `torch.Tensor` objects.
- - `'np'`: Return Numpy `np.ndarray` objects.
- sampling_rate (`int`, *optional*):
- The sampling rate at which the `audio` input was sampled. It is strongly recommended to pass
- `sampling_rate` at the forward call to prevent silent errors.
- """
- if sampling_rate is not None:
- if sampling_rate != self.sampling_rate:
- raise ValueError(
- f"The model corresponding to this feature extractor: {self} was trained using a sampling rate of"
- f" {self.sampling_rate}. Please make sure that the provided audio input was sampled with"
- f" {self.sampling_rate} and not {sampling_rate}."
- )
- else:
- logger.warning(
- f"It is strongly recommended to pass the `sampling_rate` argument to `{self.__class__.__name__}()`. "
- "Failing to do so can result in silent errors that might be hard to debug."
- )
-
- if padding and truncation:
- raise ValueError("Both padding and truncation were set. Make sure you only set one.")
- elif padding is None:
- # by default let's pad the inputs
- padding = True
-
- is_batched = bool(
- isinstance(raw_audio, (list, tuple)) and (isinstance(raw_audio[0], (np.ndarray, tuple, list)))
- )
-
- if is_batched:
- raw_audio = [np.asarray(audio, dtype=np.float32).T for audio in raw_audio]
- elif not is_batched and not isinstance(raw_audio, np.ndarray):
- raw_audio = np.asarray(raw_audio, dtype=np.float32)
- elif isinstance(raw_audio, np.ndarray) and raw_audio.dtype is np.dtype(np.float64):
- raw_audio = raw_audio.astype(np.float32)
-
- # always return batch
- if not is_batched:
- raw_audio = [np.asarray(raw_audio).T]
-
- # verify inputs are valid
- for idx, example in enumerate(raw_audio):
- if example.ndim > 2:
- raise ValueError(f"Expected input shape (channels, length) but got shape {example.shape}")
- if self.feature_size == 1 and example.ndim != 1:
- raise ValueError(f"Expected mono audio but example has {example.shape[-1]} channels")
- if self.feature_size == 2 and example.shape[-1] != 2:
- raise ValueError(f"Expected stereo audio but example has {example.shape[-1]} channels")
-
- padded_inputs = None
- input_values = BatchFeature({"input_values": raw_audio})
- if self.chunk_stride is not None and self.chunk_length is not None and max_length is None:
- if truncation:
- max_length = min(array.shape[0] for array in raw_audio)
- nb_step = int(np.floor(max_length / self.chunk_stride))
- max_length = (nb_step - 1) * self.chunk_stride + self.chunk_length
- elif padding:
- max_length = max(array.shape[0] for array in raw_audio)
- nb_step = int(np.ceil(max_length / self.chunk_stride))
- max_length = (nb_step - 1) * self.chunk_stride + self.chunk_length
- padding = "max_length"
- else:
- padded_inputs = input_values
-
- # normal padding on batch
- if padded_inputs is None:
- padded_inputs = self.pad(
- input_values,
- max_length=max_length,
- truncation=truncation,
- padding=padding,
- return_attention_mask=padding,
- )
- if padding:
- padded_inputs["padding_mask"] = padded_inputs.pop("attention_mask")
-
- input_values = []
- for example in padded_inputs.pop("input_values"):
- if self.feature_size == 1:
- example = example[..., None]
- input_values.append(example.T)
-
- padded_inputs["input_values"] = input_values
- if return_tensors is not None:
- padded_inputs = padded_inputs.convert_to_tensors(return_tensors)
-
- return padded_inputs
+EncodecFeatureExtractor = deprecated_feature_extractor(EncodecAudioProcessor, "EncodecFeatureExtractor")
__all__ = ["EncodecFeatureExtractor"]
diff --git a/src/transformers/models/gemma3n/audio_processing_gemma3n.py b/src/transformers/models/gemma3n/audio_processing_gemma3n.py
new file mode 100644
index 000000000000..23f63b8bdb19
--- /dev/null
+++ b/src/transformers/models/gemma3n/audio_processing_gemma3n.py
@@ -0,0 +1,139 @@
+# Copyright 2025 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+
+from ...audio_processing_backends import NumpyAudioBackend
+from ...audio_utils import MelScaleConfig, SpectrogramConfig, StftConfig
+
+
+def _unfold(array, dimension, size, step):
+    """NumPy equivalent of PyTorch's unfold for 1D or 2D arrays along the last dim.
+
+    `dimension` is kept for signature parity with torch and is assumed to be the last axis.
+    """
+ if array.ndim == 1:
+ array = array[np.newaxis, :]
+ batch_size, original_length = array.shape
+ num_frames = (original_length - size) // step + 1
+ if num_frames <= 0:
+ return np.zeros((batch_size, 0, size), dtype=array.dtype)
+ output_shape = (batch_size, num_frames, size)
+ output_strides = (array.strides[0], array.strides[1] * step, array.strides[1])
+ return np.lib.stride_tricks.as_strided(array, shape=output_shape, strides=output_strides)
+
+
+class Gemma3nAudioProcessor(NumpyAudioBackend):
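+    r"""
+    Audio processor for Gemma3n, computing USM-style log-mel features: 32 ms frames with a 10 ms hop
+    (win_length 512, hop_length 160 at 16 kHz), HTK-flavored preemphasis, a 1024-point FFT and 128 mel
+    bins, plus optional per-bin mean/stddev normalization. Inputs are truncated to 30 s and padded to a
+    multiple of 128 samples.
+    """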
+ sample_rate = 16000
+ force_mono = True
+ max_length = 480000 # 30 seconds
+ truncation = True
+ pad_to_multiple_of = 128
+ preemphasis_htk_flavor = True
+
+    # n_fft = 1024: frame_length 512 → next power of 2 (512), then ×2 for fft_overdrive
+ spectrogram_config = SpectrogramConfig(
+ stft_config=StftConfig(
+ n_fft=1024,
+ win_length=512,
+ hop_length=160,
+ power=1.0,
+ center=False,
+ ),
+ mel_scale_config=MelScaleConfig(
+ n_mels=128,
+ f_min=125.0,
+ f_max=7600.0,
+ mel_scale="htk",
+ matmul_order="features_first",
+ ),
+ mel_floor=1e-5,
+ log_mode="log",
+ preemphasis=0.97,
+ computation_dtype="float64",
+ )
+
+ def __init__(self, per_bin_mean=None, per_bin_stddev=None, **kwargs):
+ super().__init__(**kwargs)
+
+ # Pre-compute window in float32 to match the upstream FE exactly
+ win_length = self.spectrogram_config.stft_config.win_length
+ hann_arange = np.arange(win_length, dtype=np.float32)
+ self.window = (0.5 * (1 - np.cos(2 * np.pi * hann_arange / win_length))).astype(np.float32)
+
+ n_mels = self.spectrogram_config.mel_scale_config.n_mels
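+        # Optional per-bin normalization statistics, kept as (1, n_mels) to broadcast over time frames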
+ if per_bin_mean is not None:
+ self.per_bin_mean = np.array(per_bin_mean).reshape(1, n_mels)
+ else:
+ self.per_bin_mean = None
+
+ if per_bin_stddev is not None:
+ self.per_bin_stddev = np.array(per_bin_stddev).reshape(1, n_mels)
+ else:
+ self.per_bin_stddev = None
+
+ def _apply_frame_processing(self, frames, *, spectrogram_config, **kwargs):
+ """HTK-style preemphasis on frames extracted with an extra sample."""
+ preemphasis = spectrogram_config.preemphasis
+ if preemphasis is not None and preemphasis > 0.0:
+ if self.preemphasis_htk_flavor:
+ first = frames[..., :1] * (1.0 - preemphasis)
+ rest = frames[..., 1:-1] - preemphasis * frames[..., :-2]
+ return np.concatenate([first, rest], axis=-1)
+ else:
+ return frames[..., 1:] - preemphasis * frames[..., :-1]
+ return frames[..., :-1]
+
+ def _stft(self, audio, *, spectrogram_config, **kwargs):
+ """Unfold-based STFT with extra-sample framing for HTK preemphasis.
+
+ Extracts frames of win_length+1 so that _apply_frame_processing can
+ reduce them to win_length after HTK preemphasis. Returns (batch, time, freq).
+ """
+ stft_cfg = spectrogram_config.stft_config
+
+ frame_size_for_unfold = stft_cfg.win_length + 1
+ frames = _unfold(audio, dimension=-1, size=frame_size_for_unfold, step=stft_cfg.hop_length)
+
+ frames = self._apply_frame_processing(frames, spectrogram_config=spectrogram_config, **kwargs)
+
+ frames = frames * self.window
+ stft = np.fft.rfft(frames, n=stft_cfg.n_fft, axis=-1)
+ return np.abs(stft)
+
+ def _normalize_magnitude(self, features, *, spectrogram_config, **kwargs):
+ """Apply log compression and per-bin normalization."""
+ result = super()._normalize_magnitude(features, spectrogram_config=spectrogram_config, **kwargs)
+
+ if self.per_bin_mean is not None:
+ result = result - self.per_bin_mean
+ if self.per_bin_stddev is not None:
+ result = result / self.per_bin_stddev
+
+ return result.astype(np.float32)
+
+ def _get_features_lengths(self, audio_lengths, spectrogram_config, include_center_frame=False):
+ """Frame count matching the FE's downsampled attention mask approach.
+
+ The upstream FE computes the mask by slicing the sample-level attention
+ mask every hop_length steps, which yields ceil(audio_length / hop_length)
+ valid frames rather than the unfold-based count.
+ """
+ hop_length = spectrogram_config.stft_config.hop_length
+ if include_center_frame:
+ # For padded length we still use the unfold formula to get total frames
+ frame_size = spectrogram_config.stft_config.win_length + 1
+ return (audio_lengths - frame_size) // hop_length + 1
+ # Match FE: attention_mask[::hop_length] gives this many valid entries
+ return (audio_lengths + hop_length - 1) // hop_length
+
+
+__all__ = ["Gemma3nAudioProcessor"]
diff --git a/src/transformers/models/gemma3n/feature_extraction_gemma3n.py b/src/transformers/models/gemma3n/feature_extraction_gemma3n.py
index e2b24fb1f19f..1b111b76b49d 100644
--- a/src/transformers/models/gemma3n/feature_extraction_gemma3n.py
+++ b/src/transformers/models/gemma3n/feature_extraction_gemma3n.py
@@ -11,323 +11,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+from ...utils.deprecation import deprecated_feature_extractor
+from .audio_processing_gemma3n import Gemma3nAudioProcessor
-import math
-from collections.abc import Sequence
-
-import numpy as np
-
-from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
-from ...feature_extraction_utils import BatchFeature
-from ...utils import PaddingStrategy, TensorType, logging
-
-
-logger = logging.get_logger(__name__)
-
-
-def create_fb_matrix(
- n_freqs: int,
- f_min: float,
- f_max: float,
- n_mels: int,
- sample_rate: int,
- fft_length: int,
- norm: str | None = None,
-) -> np.ndarray:
- r"""Create a frequency bin conversion matrix (NumPy version).
-
- Args:
- n_freqs (int): Number of frequencies to highlight/apply
- f_min (float): Minimum frequency (Hz)
- f_max (float): Maximum frequency (Hz)
- n_mels (int): Number of mel filterbanks
- sample_rate (int): Sample rate of the audio waveform
- fft_length (int): FFT length
- norm (Optional[str]): If 'slaney', divide the triangular mel weights by
- the width of the mel band (area normalization). (Default: ``None``)
-
- Returns:
- np.ndarray: Triangular filter banks (fb matrix) of size (``n_freqs``,
- ``n_mels``)
- meaning number of frequencies to highlight/apply to x the number of
- filterbanks.
- Each column is a filterbank so that assuming there is a matrix A of
- size (..., ``n_freqs``), the applied result would be
- ``A @ create_fb_matrix_numpy(A.shape[-1], ...)``.
- """
-
- if norm is not None and norm != "slaney":
- raise ValueError("norm must be one of None or 'slaney'")
-
- # freq bins
- all_freqs = np.arange(n_freqs, dtype=np.float32) * (sample_rate / fft_length)
-
- # calculate mel freq bins
- # hertz to mel(f) is 2595. * math.log10(1. + (f / 700.))
- m_min = 2595.0 * math.log10(1.0 + (f_min / 700.0))
- m_max = 2595.0 * math.log10(1.0 + (f_max / 700.0))
- m_pts = np.linspace(m_min, m_max, n_mels + 2)
- # mel to hertz(mel) is 700. * (10**(mel / 2595.) - 1.)
- f_pts = 700.0 * (10 ** (m_pts / 2595.0) - 1.0)
- # calculate difference between each mel point and each stft freq point in Hz
- f_diff = f_pts[1:] - f_pts[:-1] # (n_mels + 1)
- slopes = np.expand_dims(f_pts, 0) - np.expand_dims(all_freqs, 1) # (n_freqs, n_mels + 2)
- # create overlapping triangles
- zero = np.zeros(1, dtype=np.float32)
- down_slopes = (-1.0 * slopes[:, :-2]) / f_diff[:-1] # (n_freqs, n_mels)
- up_slopes = slopes[:, 2:] / f_diff[1:] # (n_freqs, n_mels)
- fb = np.maximum(zero, np.minimum(down_slopes, up_slopes))
-
- if norm is not None and norm == "slaney":
- # Slaney-style mel is scaled to be approx constant energy per channel
- enorm = 2.0 / (f_pts[2 : n_mels + 2] - f_pts[:n_mels])
- fb *= np.expand_dims(enorm, 0)
-
- return fb
-
-
-def _unfold(array: np.ndarray, dimension: int, size: int, step: int) -> np.ndarray:
- """A basic NumPy equivalent of PyTorch's unfold for 2D arrays along the last dim."""
- if array.ndim != 2:
- raise ValueError("This unfold implementation currently supports 2D arrays (batch, time).")
- if dimension != -1 and dimension != array.ndim - 1:
- raise ValueError("This unfold implementation only supports unfolding the last dimension.")
-
- batch_size, original_length = array.shape
- num_frames = (original_length - size) // step + 1
-
- if num_frames <= 0:
- return np.zeros((batch_size, 0, size), dtype=array.dtype)
-
- output_shape = (batch_size, num_frames, size)
- output_strides = (array.strides[0], array.strides[1] * step, array.strides[1])
-
- return np.lib.stride_tricks.as_strided(array, shape=output_shape, strides=output_strides)
-
-
-class Gemma3nAudioFeatureExtractor(SequenceFeatureExtractor):
- """An audio feature extractor Universal Speech Models https://huggingface.co/papers/2303.01037.
-
- Args:
- feature_size (`int`, *optional*, defaults to 128):
- The feature dimension of the extracted features.
- sampling_rate (`int`, *optional*, defaults to 16000):
- The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
- padding_value (`float`, *optional*, defaults to 0.0):
- Padding value used to pad the audio. Should correspond to silences.
- return_attention_mask (`bool`, *optional*, defaults to `True`):
- Whether to return the attention mask for the generated MEL spectrograms.
- frame_length_ms (`float`, *optional*, defaults to 32.0):
- The length of a frame in milliseconds.
- hop_length_ms (`float`, *optional*, defaults to 10.0):
- Length of the overlapping windows for the STFT used to obtain the Mel Frequency coefficients.
- min_frequency (`float`, *optional*, defaults to 125.0):
- The minimum frequency (in Hz) for the Mel filterbank.
- max_frequency (`float`, *optional*, defaults to 7600.0):
- The maximum frequency (in Hz) for the Mel filterbank.
- preemphasis (`float`, *optional*, defaults to 0.97):
- The preemphasis coefficient.
- preemphasis_htk_flavor (`bool`, *optional*, defaults to `True`):
- Whether to use HTK-style preemphasis.
- fft_overdrive (`bool`, *optional*, defaults to `True`):
- Whether to use FFT overdrive.
- dither (`float`, *optional*, defaults to 0.0):
- Adds dithering. In other words, adds a small Gaussian noise to each frame.
- E.g. use 0.0001 to add dithering with a normal distribution centered
- around 0.0 with standard deviation 0.0001 (assuming [-1,+1] range of raw_speech).
- The value 0.0 means no dithering.
- Dithering has similar effect as `spectrogram(mel_floor=...)`. It reduces
- the high log_mel_fbank values for signals with hard-zero sections,
- when VAD cutoff is present in the signal.
- input_scale_factor (`float`, *optional*, defaults to 1.0):
- Scaling factor applied to the input waveform.
- mel_floor (`float`, *optional*, defaults to 1e-05):
- Minimum value for Mel spectrograms to avoid log(0).
- per_bin_mean (`Optional[Sequence[float]]`, *optional*):
- Mean values for per-bin normalization.
- per_bin_stddev (`Optional[Sequence[float]]`, *optional*):
- Standard deviation values for per-bin normalization.
- """
-
- model_input_names = ["input_features", "input_features_mask"]
-
- def __init__(
- self,
- feature_size: int = 128,
- sampling_rate: int = 16_000,
- padding_value: float = 0.0,
- return_attention_mask: bool = True,
- frame_length_ms: float = 32.0,
- hop_length_ms: float = 10.0,
- min_frequency: float = 125.0,
- max_frequency: float = 7600.0,
- preemphasis: float = 0.97,
- preemphasis_htk_flavor: bool = True,
- fft_overdrive: bool = True,
- dither: float = 0.0,
- input_scale_factor: float = 1.0,
- mel_floor: float = 1e-5,
- per_bin_mean: Sequence[float] | None = None,
- per_bin_stddev: Sequence[float] | None = None,
- **kwargs,
- ):
- super().__init__(
- feature_size=feature_size,
- sampling_rate=sampling_rate,
- padding_value=padding_value,
- return_attention_mask=return_attention_mask,
- **kwargs,
- )
-
- self.min_frequency = min_frequency
- self.max_frequency = max_frequency
- self.preemphasis = preemphasis
- self.preemphasis_htk_flavor = preemphasis_htk_flavor
- self.fft_overdrive = fft_overdrive
- self.dither = dither
- self.input_scale_factor = input_scale_factor
- self.frame_length = int(round(sampling_rate * frame_length_ms / 1000.0))
- self.hop_length = int(round(sampling_rate * hop_length_ms / 1000.0))
- self.mel_floor = np.array(mel_floor, dtype=np.float64)
-
- fft_length = 2 ** math.ceil(math.log2(self.frame_length))
- if self.fft_overdrive:
- fft_length *= 2
- self.fft_length = fft_length
-
- hann_arange = np.arange(self.frame_length, dtype=np.float32)
- window = 0.5 * (1 - np.cos(2 * np.pi * hann_arange / self.frame_length))
- self.window = window.astype(np.float32)
-
- self.mel_filters = create_fb_matrix(
- n_freqs=self.fft_length // 2 + 1,
- f_min=min_frequency,
- f_max=max_frequency,
- n_mels=feature_size,
- sample_rate=self.sampling_rate,
- norm=None,
- fft_length=fft_length,
- )
-
- if per_bin_mean is not None:
- self.per_bin_mean = np.array(per_bin_mean).reshape(1, 1, feature_size)
- else:
- self.per_bin_mean = None
-
- if per_bin_stddev is not None:
- self.per_bin_stddev = np.array(per_bin_stddev).reshape(1, 1, feature_size)
- else:
- self.per_bin_stddev = None
-
- def _extract_spectrogram(self, waveform: np.ndarray, attention_mask: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
- """"""
- if waveform.ndim == 1: # If single waveform, add batch dimension
- waveform = np.expand_dims(waveform, axis=0)
-
- if self.dither > 0.0:
- waveform = waveform + self.dither * np.random.randn(*waveform.shape).astype(waveform.dtype)
-
- if self.input_scale_factor != 1.0:
- waveform = waveform * self.input_scale_factor
-
- frame_size_for_unfold = self.frame_length + 1
-
- # NumPy equivalent of unfold for [B, NumFrames, frame_size_for_unfold]
- frames_to_process = _unfold(waveform, dimension=-1, size=frame_size_for_unfold, step=self.hop_length)
-
- if self.preemphasis > 0.0:
- if self.preemphasis_htk_flavor:
- first_in_frame = frames_to_process[..., :1] * (1.0 - self.preemphasis)
- rest_in_frame = frames_to_process[..., 1:-1] - self.preemphasis * frames_to_process[..., :-2]
- frames = np.concatenate([first_in_frame, rest_in_frame], axis=-1)
- else:
- frames = frames_to_process[..., 1:] - self.preemphasis * frames_to_process[..., :-1]
- else:
- frames = frames_to_process[..., :-1]
-
- frames = frames * self.window # Broadcasting window
- stft = np.fft.rfft(frames, n=self.fft_length, axis=-1)
-
- magnitude_spec = np.abs(stft)
-
- mel_spec = np.matmul(magnitude_spec, self.mel_filters)
- log_mel_spec = np.log(np.maximum(mel_spec, self.mel_floor))
-
- if self.per_bin_mean is not None:
- log_mel_spec = log_mel_spec - self.per_bin_mean # Broadcasting
-
- if self.per_bin_stddev is not None:
- log_mel_spec = log_mel_spec / self.per_bin_stddev # Broadcasting
-
- mel_spectrogram = log_mel_spec.squeeze(0)
- mask = attention_mask[:: self.hop_length].astype(bool)
- # TODO: The filtered mask is always exactly 3 elements longer than the mel_spectrogram. Why???
- return mel_spectrogram, mask[: mel_spectrogram.shape[0]]
-
- def __call__(
- self,
- raw_speech: np.ndarray | list[float] | list[np.ndarray] | list[list[float]],
- padding: bool | str | PaddingStrategy = "longest",
- max_length: int | None = 480_000,
- truncation: bool = True,
- pad_to_multiple_of: int | None = 128,
- return_tensors: str | TensorType | None = None,
- return_attention_mask: bool | None = True,
- **kwargs,
- ) -> BatchFeature:
- """Creates a batch of MEL spectrograms from the provided raw speech.
-
- This implementation uses a different algorithm for windowing and preemphasis compared to the built-in
- `transformers.audio_utils.spectrogram()` function that _will_ result in different outputs. Consider this
- carefully when selecting an audio feature extractor, especially with pre-trained models.
-
- Args:
- raw_speech:
- The audio for which MEL spectrograms are created.
- padding (`Union[bool, str, PaddingStrategy]`, *optional*, defaults to `"longest"`):
- The padding strategy to use for batches of audio with different lengths.
- max_length (`int`, *optional*, defaults to 480000):
- If provided, defines the maximum length of the audio to allow. Audio longer than this will be
- truncated if `truncation=True`.
- truncation (`bool`, *optional*, defaults to `True`):
- Whether or not to truncate audio above `max_length`.
- pad_to_multiple_of (`int`, *optional*, defaults to 128):
- When padding, pad to a multiple of this value. The default value is defined for optimal TPU support.
- return_tensors (`Union[str, TensorType]`, *optional*, defaults to `None`):
- The type of tensors to return (e.g., NumPy, or Torch).
- return_attention_mask (`bool`, *optional*, defaults to `True`):
- Whether to return the attention mask for the generated MEL spectrograms.
- """
-
- is_batched_numpy = isinstance(raw_speech, np.ndarray) and len(raw_speech.shape) > 1
- is_batched_sequence = isinstance(raw_speech, Sequence) and isinstance(raw_speech[0], (np.ndarray, Sequence))
- is_batched = is_batched_numpy or is_batched_sequence
-
- # Always return a batch
- if not is_batched:
- raw_speech = [raw_speech]
- raw_speech = [np.asarray([rs]).T for rs in raw_speech]
-
- batched_speech = self.pad(
- BatchFeature({"input_features": raw_speech}),
- padding=padding,
- max_length=max_length,
- truncation=truncation,
- pad_to_multiple_of=pad_to_multiple_of,
- return_attention_mask=return_attention_mask,
- )
-
- prepared_speech = []
- prepared_speech_mask = []
- for speech, mask in zip(batched_speech.input_features, batched_speech.attention_mask):
- speech, mask = self._extract_spectrogram(speech.T, mask)
- prepared_speech.append(speech.astype(np.float32))
- prepared_speech_mask.append(mask)
-
- return BatchFeature(
- {"input_features": prepared_speech, "input_features_mask": prepared_speech_mask},
- tensor_type=return_tensors,
- )
+Gemma3nAudioFeatureExtractor = deprecated_feature_extractor(Gemma3nAudioProcessor, "Gemma3nAudioFeatureExtractor")
__all__ = ["Gemma3nAudioFeatureExtractor"]
diff --git a/src/transformers/models/granite_speech/audio_processing_granite_speech.py b/src/transformers/models/granite_speech/audio_processing_granite_speech.py
new file mode 100644
index 000000000000..98915a5afeb9
--- /dev/null
+++ b/src/transformers/models/granite_speech/audio_processing_granite_speech.py
@@ -0,0 +1,85 @@
+# Copyright 2025 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+
+import torch
+
+from ...audio_processing_backends import TorchAudioBackend
+from ...utils.import_utils import requires_backends
+
+
+class GraniteSpeechAudioProcessor(TorchAudioBackend):
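+    r"""
+    Audio processor for Granite Speech. Computes 80-bin log-mel spectrograms with torchaudio, stacks
+    adjacent frame pairs, and derives per-example projector output lengths to build `audio_features_mask`.
+    """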
+ sample_rate = 16000
+ force_mono = True
+ return_padding_mask = False
+ do_extract_spectrogram = True
+ projector_window_size = 15
+ projector_downsample_rate = 5
+ n_fft = 512
+ win_length = 400
+ hop_length = 160
+ n_mels = 80
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        # Fail early with a clear error message when torchaudio is missing, as the legacy extractor did
+        requires_backends(self, ["torchaudio"])
+        import torchaudio
+
+ self.mel_filters_transform = torchaudio.transforms.MelSpectrogram(
+ sample_rate=self.sample_rate,
+ n_fft=self.n_fft,
+ win_length=self.win_length,
+ hop_length=self.hop_length,
+ n_mels=self.n_mels,
+ )
+
+ def extract_spectrogram(self, audio, **kwargs):
+ # Use torchaudio MelSpectrogram to match upstream FE exactly
+ melspec = self.mel_filters_transform.to(device=audio.device)
+ with torch.no_grad():
+ mel = melspec(audio.float())
+ logmel = mel.transpose(-1, -2).clip_(min=1e-10).log10_()
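+        # Clamp to 8 log10 units (80 dB) below the per-example maximum, then shift and scale the range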
+ mx = logmel.amax(dim=(-2, -1), keepdim=True)
+ logmel = torch.maximum(logmel, mx - 8.0).div_(4).add_(1)
+ # Remove last frame if odd
+ if logmel.shape[1] % 2 == 1:
+ logmel = logmel[:, :-1]
+ # Stacking by 2
+ features = logmel.reshape(audio.shape[0], -1, 2 * logmel.shape[-1])
+ return features
+
+ def _postprocess_output(self, output, audio_ranges=None, **kwargs):
+ hop_length = self.hop_length
+
+ # Compute audio_embed_sizes from original audio lengths
+ effective_window_size = self.projector_window_size // self.projector_downsample_rate
+ audio_embed_sizes = []
+ for start, end in audio_ranges:
+ raw_length = end - start
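+            # mel frames -> encoder frames (two mel frames each) -> projector blocks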
+ mel_length = raw_length // hop_length + 1
+ encoder_length = mel_length // 2
+ nblocks = math.ceil(encoder_length / self.projector_window_size)
+ projector_length = nblocks * effective_window_size
+ audio_embed_sizes.append(projector_length)
+
+        # Build the boolean mask over projector output positions (returned by the legacy FE as input_features_mask)
+ input_features_mask = torch.arange(max(audio_embed_sizes)).view(1, -1) < torch.tensor(
+ audio_embed_sizes
+ ).view(-1, 1)
+
+ output["audio_embed_sizes"] = audio_embed_sizes
+ output["audio_features_mask"] = input_features_mask
+ return output
+
+
+__all__ = ["GraniteSpeechAudioProcessor"]
diff --git a/src/transformers/models/granite_speech/feature_extraction_granite_speech.py b/src/transformers/models/granite_speech/feature_extraction_granite_speech.py
index cd32d0433bae..15bab8e6466f 100644
--- a/src/transformers/models/granite_speech/feature_extraction_granite_speech.py
+++ b/src/transformers/models/granite_speech/feature_extraction_granite_speech.py
@@ -11,174 +11,12 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-"""Feature extractor class for Granite Speech."""
+from ...utils.deprecation import deprecated_feature_extractor
+from .audio_processing_granite_speech import GraniteSpeechAudioProcessor
-import math
-from collections.abc import Sequence
-
-import numpy as np
-
-from ...feature_extraction_utils import BatchFeature, FeatureExtractionMixin
-from ...tokenization_utils_base import AudioInput
-from ...utils import is_torch_available, is_torchaudio_available, logging
-from ...utils.import_utils import requires_backends
-
-
-logger = logging.get_logger(__name__)
-
-if is_torch_available():
- import torch
-
-if is_torchaudio_available():
- import torchaudio
-
-
-class GraniteSpeechFeatureExtractor(FeatureExtractionMixin):
- model_input_names = ["input_features"]
-
- def __init__(
- self,
- sampling_rate: int = 16000,
- n_fft: int = 512,
- win_length: int = 400,
- hop_length: int = 160,
- n_mels: int = 80,
- projector_window_size: int = 15,
- projector_downsample_rate: int = 5,
- **kwargs,
- ):
- super().__init__(**kwargs)
- self.sampling_rate = sampling_rate
- self.melspec_kwargs = {
- "sample_rate": sampling_rate,
- "n_fft": n_fft,
- "win_length": win_length,
- "hop_length": hop_length,
- "n_mels": n_mels,
- }
- requires_backends(self, ["torchaudio"])
- self.mel_filters = torchaudio.transforms.MelSpectrogram(**self.melspec_kwargs)
- self.projector_window_size = projector_window_size
- self.projector_downsample_rate = projector_downsample_rate
-
- def __call__(
- self,
- audios: AudioInput,
- device: str | None = "cpu",
- ) -> BatchFeature:
- requires_backends(self, ["torchaudio"])
-
- speech_inputs = {}
- batched_audio, audio_lengths = self._get_audios_and_audio_lengths(audios)
- speech_inputs["input_features"] = self._extract_mel_spectrograms(
- batched_audio,
- device=device,
- )
- audio_embed_sizes = self._get_num_audio_features(audio_lengths)
- speech_inputs["audio_embed_sizes"] = audio_embed_sizes
- # TODO (@alex-jw-brooks): Currently input_features_mask is not
- # a great name, because input_features and input_features_mask
- # have different shapes (before/after the projector).
- #
- # We should align this with other multimodal models, e.g,. llava
- # and qwen2audio and refactor this to ensure input_feature_mask
- # has the same dimensionality as input_features, or compute it in
- # the model based on the audio embedding sizes (since we do not
- # have an attention mask for the audio features to infer padding from).
- speech_inputs["input_features_mask"] = torch.arange(max(audio_embed_sizes)).view(1, -1) < torch.tensor(
- audio_embed_sizes
- ).view(-1, 1)
- return BatchFeature(data=speech_inputs)
-
- def _extract_mel_spectrograms(self, audio: "torch.Tensor", device="cpu"):
- """
- Compute the Mel features to be passed to the conformer encoder.
- """
- requires_backends(self, ["torchaudio"])
- if device is not None:
- melspec = self.mel_filters.to(device)
- audio = audio.to(device)
- else:
- melspec = self.mel_filters
-
- bsz = audio.shape[0]
- with torch.no_grad():
- # Compute mel features
- mel = melspec(audio.float())
- logmel = mel.transpose(-1, -2).clip_(min=1e-10).log10_()
- mx = logmel.amax(dim=(-2, -1), keepdim=True)
- logmel = torch.maximum(logmel, mx - 8.0).div_(4).add_(1)
- # remove last frame if odd
- if logmel.shape[1] % 2 == 1:
- logmel = logmel[:, :-1]
-
- # stacking and skipping by 2
- audio = logmel.reshape(bsz, -1, 2 * logmel.shape[-1])
-
- return audio
-
- def _get_num_audio_features(self, audio_lengths: Sequence[int]) -> Sequence[int]:
- """
- Gets the (variable length) number of features (i.e., projector output) for the sequences
- being considered.
-
- Args:
- audio_lengths (`Sequence[int]`):
- Sequence of one or more raw audio lengths.
- """
- hop_length = self.melspec_kwargs["hop_length"]
- effective_window_size = self.projector_window_size // self.projector_downsample_rate
-
- projector_lengths = []
- for raw_length in audio_lengths:
- # mel sequence length computation
- mel_length = raw_length // hop_length + 1
- # encoder frame takes two mel features
- encoder_length = mel_length // 2
- nblocks = math.ceil(encoder_length / self.projector_window_size)
- # projector output length
- projector_length = nblocks * effective_window_size
- projector_lengths.append(projector_length)
-
- return projector_lengths
-
- def _get_audios_and_audio_lengths(self, audios: AudioInput) -> Sequence["torch.Tensor", Sequence[int]]:
- """
- Coerces audio inputs to torch tensors and extracts audio lengths prior to stacking.
-
- Args:
- audios (`AudioInput`):
- Audio sequence, numpy array, or torch tensor.
- """
- requires_backends(self, ["torch"])
-
- # Coerce to PyTorch tensors if we have numpy arrays, since
- # currently we have a dependency on torch/torchaudio anyway
- if isinstance(audios, np.ndarray):
- audios = torch.from_numpy(audios)
- elif isinstance(audios, Sequence) and isinstance(audios[0], np.ndarray):
- audios = [torch.from_numpy(arr) for arr in audios]
-
- if isinstance(audios, torch.Tensor):
- if audios.ndim == 1:
- audios = audios.unsqueeze(0)
- if not torch.is_floating_point(audios):
- raise ValueError("Invalid audio provided. Audio should be a floating point between 0 and 1")
-
- if audios.shape[0] > 1:
- logger.warning("Audio samples are already collated; assuming they all have the same length")
- lengths = [audios.shape[-1]] * audios.shape[0]
- return audios, lengths
-
- elif isinstance(audios, Sequence) and isinstance(audios[0], torch.Tensor):
- if not torch.is_floating_point(audios[0]):
- raise ValueError("Invalid audio provided. Audio should be a floating point between 0 and 1")
- lengths = [audio.shape[-1] for audio in audios]
- audios = [audio.squeeze(0) for audio in audios]
- audios = torch.nn.utils.rnn.pad_sequence(audios, batch_first=True, padding_value=0.0)
- return audios, lengths
-
- raise TypeError("Invalid audio provided. Audio should be a one or more torch tensors or numpy arrays")
+GraniteSpeechFeatureExtractor = deprecated_feature_extractor(
+ GraniteSpeechAudioProcessor, "GraniteSpeechFeatureExtractor"
+)
__all__ = ["GraniteSpeechFeatureExtractor"]
diff --git a/src/transformers/models/kyutai_speech_to_text/audio_processing_kyutai_speech_to_text.py b/src/transformers/models/kyutai_speech_to_text/audio_processing_kyutai_speech_to_text.py
new file mode 100644
index 000000000000..a07b213a2c9d
--- /dev/null
+++ b/src/transformers/models/kyutai_speech_to_text/audio_processing_kyutai_speech_to_text.py
@@ -0,0 +1,43 @@
+# Copyright 2025 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+
+from ...audio_processing_backends import NumpyAudioBackend
+
+
+class KyutaiSpeechToTextAudioProcessor(NumpyAudioBackend):
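+    r"""
+    Audio processor for KyutaiSpeechToText: mono 24 kHz waveforms with an added channel dimension,
+    padded on the left with a 1 s silence prefix and on the right with the audio delay plus one
+    extra second (3.5 s in total).
+    """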
+ sample_rate = 24000
+ force_mono = True
+ add_channel_dim = True
+ audio_delay_seconds = 2.5
+ audio_silence_prefix_seconds = 1.0
+
+ def _postprocess_output(self, output, **kwargs):
+ # Add silence prefix (left) and delay (right) padding
+ pad_left = int(self.audio_silence_prefix_seconds * self.sample_rate)
+ pad_right = int((self.audio_delay_seconds + 1.0) * self.sample_rate)
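+        # pad_right spans the 2.5 s audio delay plus one extra second; padded samples are
+        # zero-valued and marked invalid in audio_values_mask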
+
+ if pad_left > 0 or pad_right > 0:
+ output["audio_values"] = np.pad(
+ output["audio_values"], [(0, 0), (0, 0), (pad_left, pad_right)], mode="constant", constant_values=0.0,
+ )
+ output["audio_values_mask"] = np.pad(
+ output["audio_values_mask"], [(0, 0), (pad_left, pad_right)], mode="constant", constant_values=0,
+ )
+
+ return output
+
+
+__all__ = ["KyutaiSpeechToTextAudioProcessor"]
diff --git a/src/transformers/models/kyutai_speech_to_text/feature_extraction_kyutai_speech_to_text.py b/src/transformers/models/kyutai_speech_to_text/feature_extraction_kyutai_speech_to_text.py
index b472473a19e5..5abc645f3f8a 100644
--- a/src/transformers/models/kyutai_speech_to_text/feature_extraction_kyutai_speech_to_text.py
+++ b/src/transformers/models/kyutai_speech_to_text/feature_extraction_kyutai_speech_to_text.py
@@ -1,10 +1,7 @@
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
-# This file was automatically generated from src/transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py.
-# Do NOT edit this file manually as any edits will be overwritten by the generation of
-# the file from the modular. If any change should be done, please apply the change to the
-# modular_kyutai_speech_to_text.py file directly. One of our CI enforces this.
+# This file is now a thin backward-compatibility wrapper. The original was auto-generated from modular_kyutai_speech_to_text.py.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
-# Copyright 2025 Kyutai and The HuggingFace Inc. team. All rights reserved.
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -17,218 +14,12 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+from ...utils.deprecation import deprecated_feature_extractor
+from .audio_processing_kyutai_speech_to_text import KyutaiSpeechToTextAudioProcessor
-import numpy as np
-
-from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
-from ...feature_extraction_utils import BatchFeature
-from ...utils import PaddingStrategy, TensorType, logging
-
-
-logger = logging.get_logger(__name__)
-
-
-class KyutaiSpeechToTextFeatureExtractor(SequenceFeatureExtractor):
- r"""
- Constructs an KyutaiSpeechToText feature extractor.
-
- This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
- most of the main methods. Users should refer to this superclass for more information regarding those methods.
-
- Args:
- feature_size (`int`, *optional*, defaults to 1):
- The feature dimension of the extracted features. Use 1 for mono, 2 for stereo.
- sampling_rate (`int`, *optional*, defaults to 24000):
- The sampling rate at which the audio waveform should be digitalized expressed in hertz (Hz).
- padding_value (`float`, *optional*, defaults to 0.0):
- The value that is used to fill the padding values.
- chunk_length_s (`float`, *optional*):
- If defined the audio is pre-processed into chunks of lengths `chunk_length_s` and then encoded.
- overlap (`float`, *optional*):
- Defines the overlap between each chunk. It is used to compute the `chunk_stride` using the following
- formulae : `int((1.0 - self.overlap) * self.chunk_length)`.
- audio_delay_seconds (`float`, *optional*, defaults to 0.0):
- The delay in seconds to add after the audio (right padding).
- audio_silence_prefix_seconds (`float`, *optional*, defaults to 0.0):
- The silence prefix in seconds to add before the audio (left padding).
- """
-
- model_input_names = ["input_values", "padding_mask"]
-
- def __init__(
- self,
- feature_size: int = 1,
- sampling_rate: int = 24000,
- padding_value: float = 0.0,
- chunk_length_s: float | None = None,
- overlap: float | None = None,
- audio_delay_seconds: float | None = 0.0,
- audio_silence_prefix_seconds: float | None = 0.0,
- **kwargs,
- ):
- super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs)
- self.chunk_length_s = chunk_length_s
- self.overlap = overlap
- self.audio_delay_seconds = audio_delay_seconds
- self.audio_silence_prefix_seconds = audio_silence_prefix_seconds
-
- # This is a property because you might want to change the chunk_length_s on the fly
- @property
- def chunk_length(self) -> int | None:
- if self.chunk_length_s is None:
- return None
- else:
- return int(self.chunk_length_s * self.sampling_rate)
-
- # This is a property because you might want to change the chunk_length_s on the fly
- @property
- def chunk_stride(self) -> int | None:
- if self.chunk_length_s is None or self.overlap is None:
- return None
- else:
- return max(1, int((1.0 - self.overlap) * self.chunk_length))
-
- def __call__(
- self,
- raw_audio: np.ndarray | list[float] | list[np.ndarray] | list[list[float]],
- padding: bool | str | PaddingStrategy | None = None,
- truncation: bool | None = False,
- max_length: int | None = None,
- return_tensors: str | TensorType | None = None,
- sampling_rate: int | None = None,
- ) -> BatchFeature:
- """
- Main method to featurize and prepare for the model one or several sequence(s).
-
- Args:
- raw_audio (`np.ndarray`, `list[float]`, `list[np.ndarray]`, `list[list[float]]`):
- The sequence or batch of sequences to be processed. Each sequence can be a numpy array, a list of float
- values, a list of numpy arrays or a list of list of float values. The numpy array must be of shape
- `(num_samples,)` for mono audio (`feature_size = 1`), or `(2, num_samples)` for stereo audio
- (`feature_size = 2`).
- padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
- Select a strategy to pad the returned sequences (according to the model's padding side and padding
- index) among:
-
- - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
- sequence if provided).
- - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
- acceptable input length for the model if that argument is not provided.
- - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
- lengths).
- truncation (`bool`, *optional*, defaults to `False`):
- Activates truncation to cut input sequences longer than `max_length` to `max_length`.
- max_length (`int`, *optional*):
- Maximum length of the returned list and optionally padding length (see above).
- return_tensors (`str` or [`~utils.TensorType`], *optional*):
- If set, will return tensors instead of list of python integers. Acceptable values are:
-
- - `'pt'`: Return PyTorch `torch.Tensor` objects.
- - `'np'`: Return Numpy `np.ndarray` objects.
- sampling_rate (`int`, *optional*):
- The sampling rate at which the `audio` input was sampled. It is strongly recommended to pass
- `sampling_rate` at the forward call to prevent silent errors.
- """
- if sampling_rate is not None:
- if sampling_rate != self.sampling_rate:
- raise ValueError(
- f"The model corresponding to this feature extractor: {self} was trained using a sampling rate of"
- f" {self.sampling_rate}. Please make sure that the provided audio input was sampled with"
- f" {self.sampling_rate} and not {sampling_rate}."
- )
- else:
- logger.warning(
- f"It is strongly recommended to pass the `sampling_rate` argument to `{self.__class__.__name__}()`. "
- "Failing to do so can result in silent errors that might be hard to debug."
- )
-
- if padding and truncation:
- raise ValueError("Both padding and truncation were set. Make sure you only set one.")
- elif padding is None:
- # by default let's pad the inputs
- padding = True
-
- is_batched = bool(
- isinstance(raw_audio, (list, tuple)) and (isinstance(raw_audio[0], (np.ndarray, tuple, list)))
- )
-
- if is_batched:
- raw_audio = [np.asarray(audio, dtype=np.float32).T for audio in raw_audio]
- elif not is_batched and not isinstance(raw_audio, np.ndarray):
- raw_audio = np.asarray(raw_audio, dtype=np.float32)
- elif isinstance(raw_audio, np.ndarray) and raw_audio.dtype is np.dtype(np.float64):
- raw_audio = raw_audio.astype(np.float32)
-
- # always return batch
- if not is_batched:
- raw_audio = [np.asarray(raw_audio).T]
-
- # verify inputs are valid
- for idx, example in enumerate(raw_audio):
- if example.ndim > 2:
- raise ValueError(f"Expected input shape (channels, length) but got shape {example.shape}")
- if self.feature_size == 1 and example.ndim != 1:
- raise ValueError(f"Expected mono audio but example has {example.shape[-1]} channels")
- if self.feature_size == 2 and example.shape[-1] != 2:
- raise ValueError(f"Expected stereo audio but example has {example.shape[-1]} channels")
-
- padded_inputs = None
- input_values = BatchFeature({"input_values": raw_audio})
- if self.chunk_stride is not None and self.chunk_length is not None and max_length is None:
- if truncation:
- max_length = min(array.shape[0] for array in raw_audio)
- nb_step = int(np.floor(max_length / self.chunk_stride))
- max_length = (nb_step - 1) * self.chunk_stride + self.chunk_length
- elif padding:
- max_length = max(array.shape[0] for array in raw_audio)
- nb_step = int(np.ceil(max_length / self.chunk_stride))
- max_length = (nb_step - 1) * self.chunk_stride + self.chunk_length
- padding = "max_length"
- else:
- padded_inputs = input_values
-
- # normal padding on batch
- if padded_inputs is None:
- padded_inputs = self.pad(
- input_values,
- max_length=max_length,
- truncation=truncation,
- padding=padding,
- return_attention_mask=padding,
- )
-
- if padding:
- padded_inputs["padding_mask"] = padded_inputs.pop("attention_mask")
-
- # now let's pad left and right
- pad_left = int(self.audio_silence_prefix_seconds * self.sampling_rate)
- pad_right = int((self.audio_delay_seconds + 1.0) * self.sampling_rate)
- padded_inputs["input_values"] = np.pad(
- padded_inputs["input_values"],
- ((0, 0), (pad_left, pad_right)),
- mode="constant",
- constant_values=0.0,
- )
- if padding:
- padded_inputs["padding_mask"] = np.pad(
- padded_inputs["padding_mask"],
- ((0, 0), (pad_left, pad_right)),
- mode="constant",
- constant_values=0,
- )
-
- input_values = []
- for example in padded_inputs.pop("input_values"):
- if self.feature_size == 1:
- example = example[..., None]
- input_values.append(example.T)
-
- padded_inputs["input_values"] = input_values
- if return_tensors is not None:
- padded_inputs = padded_inputs.convert_to_tensors(return_tensors)
-
- return padded_inputs
-
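+# Backward-compatible alias: the old feature-extractor name now resolves to the
+# new audio processor via the deprecated_feature_extractor shim.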
+KyutaiSpeechToTextFeatureExtractor = deprecated_feature_extractor(
+ KyutaiSpeechToTextAudioProcessor, "KyutaiSpeechToTextFeatureExtractor"
+)
__all__ = ["KyutaiSpeechToTextFeatureExtractor"]
diff --git a/src/transformers/models/lasr/audio_processing_lasr.py b/src/transformers/models/lasr/audio_processing_lasr.py
new file mode 100644
index 000000000000..a1b581628988
--- /dev/null
+++ b/src/transformers/models/lasr/audio_processing_lasr.py
@@ -0,0 +1,56 @@
+# Copyright 2025 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ...audio_processing_backends import TorchAudioBackend
+from ...audio_utils import MelScaleConfig, SpectrogramConfig, StftConfig
+
+
+class LasrAudioProcessor(TorchAudioBackend):
+ sample_rate = 16000
+ force_mono = True
+ spectrogram_config = SpectrogramConfig(
+ stft_config=StftConfig(
+ n_fft=512,
+ hop_length=160,
+ win_length=400,
+ power=2.0,
+ center=False,
+ periodic=False,
+ left_align_fft=True,
+ ),
+ mel_scale_config=MelScaleConfig(
+ n_mels=128,
+ f_min=125.0,
+ f_max=7500.0,
+ mel_scale="kaldi",
+ triangularize_in_mel_space=True,
+ bands_to_zero=1,
+ computation_dtype="float64",
+ matmul_order="features_first",
+ ),
+ log_mode="log",
+ mel_floor=1e-5,
+ computation_dtype="float64",
+ )
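+    # The backend's mel_floor/log handling (clamp to 1e-5, then log) reproduces
+    # the old extractor's torch.log(torch.clamp(mel_spec, min=1e-5)).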
+
+ def _get_features_lengths(self, audio_lengths, spectrogram_config, include_center_frame=False):
+ stft_cfg = spectrogram_config.stft_config
+ win_length = stft_cfg.win_length or stft_cfg.n_fft
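+        # center=False framing: 1 s at 16 kHz gives (16000 - 400) // 160 + 1 = 98 frames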
+ return (audio_lengths - win_length) // stft_cfg.hop_length + 1
+
+
+__all__ = ["LasrAudioProcessor"]
diff --git a/src/transformers/models/lasr/feature_extraction_lasr.py b/src/transformers/models/lasr/feature_extraction_lasr.py
index 7cf1822ee40d..90b1954ec5f2 100644
--- a/src/transformers/models/lasr/feature_extraction_lasr.py
+++ b/src/transformers/models/lasr/feature_extraction_lasr.py
@@ -11,265 +11,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+from ...utils.deprecation import deprecated_feature_extractor
+from .audio_processing_lasr import LasrAudioProcessor
-import numpy as np
-import torch
-
-from ...audio_utils import hertz_to_mel
-from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
-from ...feature_extraction_utils import BatchFeature
-from ...utils import TensorType, logging
-from ...utils.import_utils import requires
-
-
-logger = logging.get_logger(__name__)
-
-
-# TODO: @eustlb, we should be able to remove this and use mel_filter_bank from audio_utils
-def linear_to_mel_weight_matrix(
- num_mel_bins: int,
- num_spectrogram_bins: int,
- sample_rate: float,
- lower_edge_hertz: float,
- upper_edge_hertz: float,
- dtype,
-) -> np.ndarray:
- """NumPy-port of the JAX mel weight matrix logic."""
- # We use float64 for precision, matching the JAX implementation.
- internal_dtype = np.float64
-
- # HTK excludes the spectrogram DC bin.
- bands_to_zero = 1
- nyquist_hertz = sample_rate / 2.0
- linear_frequencies = np.linspace(0.0, nyquist_hertz, num_spectrogram_bins, dtype=internal_dtype)[bands_to_zero:]
- spectrogram_bins_mel = hertz_to_mel(linear_frequencies, mel_scale="kaldi")[:, np.newaxis]
-
- edges = np.linspace(
- hertz_to_mel(lower_edge_hertz, mel_scale="kaldi"),
- hertz_to_mel(upper_edge_hertz, mel_scale="kaldi"),
- num_mel_bins + 2,
- dtype=internal_dtype,
- )
-
- lower_edge_mel, center_mel, upper_edge_mel = (
- edges[:-2][np.newaxis, :],
- edges[1:-1][np.newaxis, :],
- edges[2:][np.newaxis, :],
- )
-
- lower_slopes = (spectrogram_bins_mel - lower_edge_mel) / (center_mel - lower_edge_mel)
- upper_slopes = (upper_edge_mel - spectrogram_bins_mel) / (upper_edge_mel - center_mel)
- mel_weights_matrix = np.maximum(0.0, np.minimum(lower_slopes, upper_slopes))
- return np.pad(mel_weights_matrix, [[bands_to_zero, 0], [0, 0]]).astype(dtype)
-
-
-@requires(backends=("torch",))
-class LasrFeatureExtractor(SequenceFeatureExtractor):
- r"""
- Constructs a LASR feature extractor.
-
- This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
- most of the main methods. Users should refer to this superclass for more information regarding those methods.
-
- This class extracts mel-filter bank features from raw speech using a custom numpy implementation of the `Short Time
- Fourier Transform` which should match pytorch's `torch.stft` equivalent.
-
- Args:
- feature_size (`int`, *optional*, defaults to 128):
- The feature dimension of the extracted features.
- sampling_rate (`int`, *optional*, defaults to 16000):
- The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
- hop_length (`int`, *optional*, defaults to 160):
- Length of the overlapping windows for the STFT used to obtain the Mel Frequency coefficients.
- n_fft (`int`, *optional*, defaults to 512):
- Size of the Fourier transform.
- win_length (`int`, *optional*, defaults to 400):
- The window length for the STFT computation.
- padding_value (`float`, *optional*, defaults to 0.0):
- Padding value used to pad the audio. Should correspond to silences.
- """
-
- model_input_names = ["input_features", "attention_mask"]
-
- def __init__(
- self,
- feature_size=128,
- sampling_rate=16000,
- hop_length=160,
- n_fft=512,
- win_length=400,
- padding_value=0.0,
- **kwargs,
- ):
- super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs)
-
- self.hop_length = hop_length
- self.n_fft = n_fft
- self.win_length = win_length
- self.mel_filters = torch.from_numpy(
- linear_to_mel_weight_matrix(
- num_mel_bins=feature_size,
- num_spectrogram_bins=n_fft // 2 + 1,
- sample_rate=sampling_rate,
- lower_edge_hertz=125.0,
- upper_edge_hertz=7500.0,
- dtype=np.float64,
- )
- )
-
- def _torch_extract_fbank_features(self, waveform, device="cpu"):
- # spectrogram
- window = torch.hann_window(self.win_length, periodic=False, device=device, dtype=torch.float64)
- waveform = waveform.to(torch.float64)
-
- # TODO: @eustlb, to be standardized
- # here we cannot use directly torch.stft because every fft frame is padded with zeros
- # due to unfold then rfft, while torch.stft unfolds with the number of fft points
- frames = waveform.unfold(-1, self.win_length, self.hop_length)
- stft = torch.fft.rfft(window * frames, n=self.n_fft)
- power_spec = torch.abs(stft) ** 2
-
- # log mel spectrogram
- mel_filters = self.mel_filters.to(device)
- mel_spec = torch.clamp(power_spec @ mel_filters, min=1e-5)
- mel_spec = torch.log(mel_spec)
-
- return mel_spec
-
- def __call__(
- self,
- raw_speech: np.ndarray | list[float] | list[np.ndarray] | list[list[float]],
- truncation: bool = False,
- pad_to_multiple_of: int | None = None,
- return_tensors: str | TensorType | None = None,
- return_attention_mask: bool | None = None,
- padding: str | None = "longest",
- max_length: int | None = None,
- sampling_rate: int | None = None,
- do_normalize: bool | None = None,
- device: str | None = "cpu",
- return_token_timestamps: bool | None = None,
- **kwargs,
- ) -> BatchFeature:
- """
- Main method to featurize and prepare for the model one or several sequence(s). Implementation uses PyTorch for
- the STFT computation if available, otherwise a slower NumPy based one.
-
- Args:
- raw_speech (`np.ndarray`, `list[float]`, `list[np.ndarray]`, `list[list[float]]`):
- The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
- values, a list of numpy arrays or a list of list of float values. Must be mono channel audio, not
- stereo, i.e. single float per timestep.
- truncation (`bool`, *optional*, default to `True`):
- Activates truncation to cut input sequences longer than *max_length* to *max_length*.
- pad_to_multiple_of (`int`, *optional*, defaults to None):
- If set will pad the sequence to a multiple of the provided value.
-
- This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
- `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
- return_attention_mask (`bool`, *optional*):
- Whether to return the attention mask. If left to the default, will return the attention mask according
- to the specific feature_extractor's default.
-
- [What are attention masks?](../glossary#attention-mask)
-
-
-
- For Parakeet models, `attention_mask` should always be passed for batched inference, to avoid subtle
- bugs.
-
-
-
- return_tensors (`str` or [`~utils.TensorType`], *optional*):
- If set, will return tensors instead of list of python integers. Acceptable values are:
-
- - `'tf'`: Return TensorFlow `tf.constant` objects.
- - `'pt'`: Return PyTorch `torch.Tensor` objects.
- - `'np'`: Return Numpy `np.ndarray` objects.
- sampling_rate (`int`, *optional*):
- The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass
- `sampling_rate` at the forward call to prevent silent errors and allow automatic speech recognition
- pipeline.
- padding_value (`float`, *optional*, defaults to 0.0):
- The value that is used to fill the padding values / vectors.
- do_normalize (`bool`, *optional*, defaults to `False`):
- Whether or not to zero-mean unit-variance normalize the input. Normalizing can help to significantly
- improve the performance of the model.
- device (`str`, *optional*, defaults to `'cpu'`):
- Specifies the device for computation of the log-mel spectrogram of audio signals in the
- `_torch_extract_fbank_features` method. (e.g., "cpu", "cuda")
- return_token_timestamps (`bool`, *optional*, defaults to `None`):
- Deprecated. Use `return_attention_mask` instead from which the number of frames can be inferred.
-
- Whether or not to return the number of frames of the input raw_speech.
- These num_frames can be used by the model to compute word level timestamps.
- """
- if sampling_rate is not None:
- if sampling_rate != self.sampling_rate:
- raise ValueError(
- f"The model corresponding to this feature extractor: {self.__class__.__name__} was trained using a"
- f" sampling rate of {self.sampling_rate}. Please make sure that the provided `raw_speech` input"
- f" was sampled with {self.sampling_rate} and not {sampling_rate}."
- )
- else:
- logger.warning(
- f"It is strongly recommended to pass the `sampling_rate` argument to `{self.__class__.__name__}()`. "
- "Failing to do so can result in silent errors that might be hard to debug."
- )
-
- # Convert to torch tensor
- if isinstance(raw_speech, np.ndarray):
- raw_speech = torch.tensor(raw_speech)
- elif isinstance(raw_speech, (list, tuple)):
- if isinstance(raw_speech[0], (list, np.ndarray)):
- raw_speech = [torch.tensor(speech) for speech in raw_speech]
- else: # list[float]
- raw_speech = torch.tensor(raw_speech)
-
- is_batched_torch = isinstance(raw_speech, torch.Tensor) and len(raw_speech.shape) > 1
- if is_batched_torch and len(raw_speech.shape) > 2:
- logger.warning(
- f"Only mono-channel audio is supported for input to {self.__class__.__name__}. "
- "We will take the mean of the channels to convert to mono."
- )
- raw_speech = raw_speech.mean(-1)
-
- is_batched_sequence = isinstance(raw_speech, (list, tuple))
- if is_batched_sequence:
- for speech in raw_speech:
- if len(speech.shape) > 1:
- logger.warning(
- f"Only mono-channel audio is supported for input to {self.__class__.__name__}. "
- "We will take the mean of the channels to convert to mono."
- )
- speech = speech.mean(-1)
-
- if is_batched_torch or is_batched_sequence:
- raw_speech = [speech[:, None].to(torch.float32) for speech in raw_speech]
- else:
- raw_speech = [raw_speech[:, None].to(torch.float32)]
-
- batched_speech = BatchFeature({"input_features": raw_speech})
- padded_inputs = self.pad(
- batched_speech,
- padding=padding,
- max_length=max_length,
- truncation=truncation,
- pad_to_multiple_of=pad_to_multiple_of,
- return_attention_mask=return_attention_mask,
- return_tensors="pt",
- )
- input_features = padded_inputs.input_features.squeeze(-1)
- input_features = self._torch_extract_fbank_features(input_features, device)
- data = {
- "input_features": input_features.to(torch.float32),
- }
-
- if return_attention_mask:
- attention_mask = padded_inputs.attention_mask[:, self.win_length - 1 :: self.hop_length]
- data["attention_mask"] = attention_mask.to(torch.bool)
-
- return BatchFeature(data=data, tensor_type=return_tensors)
+LasrFeatureExtractor = deprecated_feature_extractor(LasrAudioProcessor, "LasrFeatureExtractor")
__all__ = ["LasrFeatureExtractor"]
diff --git a/src/transformers/models/musicgen_melody/audio_processing_musicgen_melody.py b/src/transformers/models/musicgen_melody/audio_processing_musicgen_melody.py
new file mode 100644
index 000000000000..1585ffae93d0
--- /dev/null
+++ b/src/transformers/models/musicgen_melody/audio_processing_musicgen_melody.py
@@ -0,0 +1,84 @@
+# Copyright 2025 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ...audio_processing_backends import TorchAudioBackend
+from ...utils.import_utils import requires
+
+
+class MusicgenMelodyAudioProcessor(TorchAudioBackend):
+ sample_rate = 32000
+ force_mono = True
+ do_extract_spectrogram = True
+ return_padding_mask = False
+ n_fft = 16384
+ hop_length = 4096
+ n_chroma = 12
+ chunk_length = 30
+
+ @requires(backends=("librosa", "torch"))
+ def __init__(self, **kwargs):
+ super().__init__(**kwargs)
+ import librosa
+ import torch
+
+ self.chroma_filters = torch.from_numpy(
+ librosa.filters.chroma(sr=self.sample_rate, n_fft=self.n_fft, tuning=0, n_chroma=self.n_chroma)
+ ).float()
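+        # librosa.filters.chroma returns an (n_chroma, 1 + n_fft // 2) matrix,
+        # i.e. (12, 8193) with the class defaults above.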
+
+ def extract_spectrogram(self, audio, **kwargs):
+ import torch
+ import torchaudio
+
+ waveform = audio # Already a batched tensor from _to_batch
+ device = waveform.device
+
+ # Pad if too short for FFT
+ if waveform.shape[-1] < self.n_fft:
+ pad = self.n_fft - waveform.shape[-1]
+ rest = 0 if pad % 2 == 0 else 1
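+            # split the shortfall evenly; any odd leftover sample goes on the right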
+ waveform = torch.nn.functional.pad(waveform, (pad // 2, pad // 2 + rest), "constant", 0)
+
+ # Add channel dim for spectrogram: (batch, 1, length)
+ waveform = waveform.unsqueeze(1)
+
+ # Power spectrogram (normalized)
+ spec_transform = torchaudio.transforms.Spectrogram(
+ n_fft=self.n_fft, win_length=self.n_fft, hop_length=self.hop_length,
+ power=2, center=True, pad=0, normalized=True,
+ ).to(device)
+ spec = spec_transform(waveform).squeeze(1)
+
+ # Chroma features
+ chroma_filters = self.chroma_filters.to(device)
+ raw_chroma = torch.einsum("cf, ...ft->...ct", chroma_filters, spec)
+
+ # Normalize with inf norm
+ norm_chroma = torch.nn.functional.normalize(raw_chroma, p=float("inf"), dim=-2, eps=1e-6)
+
+ # Transpose: (batch, chroma, frames) -> (batch, frames, chroma)
+ norm_chroma = norm_chroma.transpose(1, 2)
+
+ # One-hot encoding: argmax along chroma dim
+ idx = norm_chroma.argmax(-1, keepdim=True)
+ norm_chroma[:] = 0
+ norm_chroma.scatter_(dim=-1, index=idx, value=1)
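+        # e.g. per frame, [0.2, 0.9, 0.1, ...] -> [0.0, 1.0, 0.0, ...]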
+
+ return norm_chroma
+
+
+__all__ = ["MusicgenMelodyAudioProcessor"]
diff --git a/src/transformers/models/musicgen_melody/feature_extraction_musicgen_melody.py b/src/transformers/models/musicgen_melody/feature_extraction_musicgen_melody.py
index 1811fa11e630..c41ea0666292 100644
--- a/src/transformers/models/musicgen_melody/feature_extraction_musicgen_melody.py
+++ b/src/transformers/models/musicgen_melody/feature_extraction_musicgen_melody.py
@@ -11,324 +11,12 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-"""
-Feature extractor class for Musicgen Melody
-"""
+from ...utils.deprecation import deprecated_feature_extractor
+from .audio_processing_musicgen_melody import MusicgenMelodyAudioProcessor
-import copy
-from typing import Any
-
-import numpy as np
-
-from ...audio_utils import chroma_filter_bank
-from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
-from ...feature_extraction_utils import BatchFeature
-from ...utils import TensorType, is_torch_available, is_torchaudio_available, logging
-from ...utils.import_utils import requires
-
-
-if is_torch_available():
- import torch
-
-if is_torchaudio_available():
- import torchaudio
-
-logger = logging.get_logger(__name__)
-
-
-@requires(backends=("torchaudio",))
-class MusicgenMelodyFeatureExtractor(SequenceFeatureExtractor):
- r"""
- Constructs a MusicgenMelody feature extractor.
-
- This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
- most of the main methods. Users should refer to this superclass for more information regarding those methods.
-
- This class extracts chroma features from audio processed by [Demucs](https://github.com/adefossez/demucs/tree/main) or
- directly from raw audio waveform.
-
- Args:
- feature_size (`int`, *optional*, defaults to 12):
- The feature dimension of the extracted features.
- sampling_rate (`int`, *optional*, defaults to 32000):
- The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
- hop_length (`int`, *optional*, defaults to 4096):
- Length of the overlapping windows for the STFT used to obtain the Mel Frequency coefficients.
- chunk_length (`int`, *optional*, defaults to 30):
- The maximum number of chunks of `sampling_rate` samples used to trim and pad longer or shorter audio
- sequences.
- n_fft (`int`, *optional*, defaults to 16384):
- Size of the Fourier transform.
- num_chroma (`int`, *optional*, defaults to 12):
- Number of chroma bins to use.
- padding_value (`float`, *optional*, defaults to 0.0):
- Padding value used to pad the audio.
- return_attention_mask (`bool`, *optional*, defaults to `False`):
- Whether to return the attention mask. Can be overwritten when calling the feature extractor.
-
- [What are attention masks?](../glossary#attention-mask)
-
-
-
- For Whisper models, `attention_mask` should always be passed for batched inference, to avoid subtle
- bugs.
-
-
- stem_indices (`list[int]`, *optional*, defaults to `[3, 2]`):
- Stem channels to extract if demucs outputs are passed.
- """
-
- model_input_names = ["input_features"]
-
- def __init__(
- self,
- feature_size=12,
- sampling_rate=32000,
- hop_length=4096,
- chunk_length=30,
- n_fft=16384,
- num_chroma=12,
- padding_value=0.0,
- return_attention_mask=False, # pad inputs to max length with silence token (zero) and no attention mask
- stem_indices=[3, 2],
- **kwargs,
- ):
- super().__init__(
- feature_size=feature_size,
- sampling_rate=sampling_rate,
- padding_value=padding_value,
- return_attention_mask=return_attention_mask,
- **kwargs,
- )
- self.n_fft = n_fft
- self.hop_length = hop_length
- self.chunk_length = chunk_length
- self.n_samples = chunk_length * sampling_rate
- self.sampling_rate = sampling_rate
- self.chroma_filters = torch.from_numpy(
- chroma_filter_bank(sampling_rate=sampling_rate, num_frequency_bins=n_fft, tuning=0, num_chroma=num_chroma)
- ).float()
- self.spectrogram = torchaudio.transforms.Spectrogram(
- n_fft=n_fft, win_length=n_fft, hop_length=hop_length, power=2, center=True, pad=0, normalized=True
- )
- self.stem_indices = stem_indices
-
- def _torch_extract_fbank_features(self, waveform: torch.Tensor) -> torch.Tensor:
- """
- Compute the chroma spectrogram of the provided audio using the torchaudio spectrogram implementation and the librosa chroma features.
- """
-
- # if wav length is not long enough, pad it
- wav_length = waveform.shape[-1]
- if wav_length < self.n_fft:
- pad = self.n_fft - wav_length
- rest = 0 if pad % 2 == 0 else 1
- waveform = torch.nn.functional.pad(waveform, (pad // 2, pad // 2 + rest), "constant", 0)
-
- # squeeze alongside channel dimension
- spec = self.spectrogram(waveform).squeeze(1)
-
- # sum along the frequency dimension
- raw_chroma = torch.einsum("cf, ...ft->...ct", self.chroma_filters, spec)
-
- # normalise with max value
- norm_chroma = torch.nn.functional.normalize(raw_chroma, p=float("inf"), dim=-2, eps=1e-6)
-
- # transpose time and chroma dimension -> (batch, time, chroma)
- norm_chroma = norm_chroma.transpose(1, 2)
-
- # replace max value alongside chroma dimension with 1 and replace the rest with 0
- idx = norm_chroma.argmax(-1, keepdim=True)
- norm_chroma[:] = 0
- norm_chroma.scatter_(dim=-1, index=idx, value=1)
-
- return norm_chroma
-
- def _extract_stem_indices(self, audio, sampling_rate=None):
- """
- Extracts stems from the output of the [Demucs](https://github.com/adefossez/demucs/tree/main) audio separation model,
- then converts to mono-channel and resample to the feature extractor sampling rate.
-
- Args:
- audio (`torch.Tensor` of shape `(batch_size, num_stems, channel_size, audio_length)`):
- The output of the Demucs model to be processed.
- sampling_rate (`int`, *optional*):
- Demucs sampling rate. If not specified, defaults to `44000`.
- """
- sampling_rate = 44000 if sampling_rate is None else sampling_rate
-
- # extract "vocals" and "others" sources from audio encoder (demucs) output
- # [batch_size, num_stems, channel_size, audio_length]
- wav = audio[:, torch.tensor(self.stem_indices)]
-
- # merge extracted stems to single waveform
- wav = wav.sum(1)
-
- # convert to mono-channel waveform
- wav = wav.mean(dim=1, keepdim=True)
-
- # resample to model sampling rate
- # not equivalent to julius.resample
- if sampling_rate != self.sampling_rate:
- wav = torchaudio.functional.resample(
- wav, sampling_rate, self.sampling_rate, rolloff=0.945, lowpass_filter_width=24
- )
-
- # [batch_size, 1, audio_length] -> [batch_size, audio_length]
- wav = wav.squeeze(1)
-
- return wav
-
- def __call__(
- self,
- audio: np.ndarray | list[float] | list[np.ndarray] | list[list[float]],
- truncation: bool = True,
- pad_to_multiple_of: int | None = None,
- return_tensors: str | TensorType | None = None,
- return_attention_mask: bool | None = None,
- padding: str | None = True,
- max_length: int | None = None,
- sampling_rate: int | None = None,
- **kwargs,
- ) -> BatchFeature:
- """
- Main method to featurize and prepare for the model one or several sequence(s).
-
- Args:
- audio (`torch.Tensor`, `np.ndarray`, `list[float]`, `list[np.ndarray]`, `list[torch.Tensor]`, `list[list[float]]`):
- The sequence or batch of sequences to be padded. Each sequence can be a torch tensor, a numpy array, a list of float
- values, a list of numpy arrays, a list of torch tensors, or a list of list of float values.
- If `audio` is the output of Demucs, it has to be a torch tensor of shape `(batch_size, num_stems, channel_size, audio_length)`.
- Otherwise, it must be mono or stereo channel audio.
- truncation (`bool`, *optional*, default to `True`):
- Activates truncation to cut input sequences longer than *max_length* to *max_length*.
- pad_to_multiple_of (`int`, *optional*, defaults to None):
- If set will pad the sequence to a multiple of the provided value.
-
- This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
- `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
- return_tensors (`str` or [`~utils.TensorType`], *optional*):
- If set, will return tensors instead of list of python integers. Acceptable values are:
-
- - `'pt'`: Return PyTorch `torch.Tensor` objects.
- - `'np'`: Return Numpy `np.ndarray` objects.
- return_attention_mask (`bool`, *optional*):
- Whether to return the attention mask. If left to the default, will return the attention mask according
- to the specific feature_extractor's default.
-
- [What are attention masks?](../glossary#attention-mask)
-
-
- For Musicgen Melody models, audio `attention_mask` is not necessary.
-
-
- padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
- Select a strategy to pad the returned sequences (according to the model's padding side and padding
- index) among:
-
- - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
- sequence if provided).
- - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
- acceptable input length for the model if that argument is not provided.
- - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
- lengths).
- max_length (`int`, *optional*):
- Maximum length of the returned list and optionally padding length (see above).
- sampling_rate (`int`, *optional*):
- The sampling rate at which the `audio` input was sampled. It is strongly recommended to pass
- `sampling_rate` at the forward call to prevent silent errors.
- Note that if `audio` is the output of Demucs, `sampling_rate` must be the sampling rate at which Demucs operates.
- """
-
- if sampling_rate is None:
- logger.warning_once(
- f"It is strongly recommended to pass the `sampling_rate` argument to `{self.__class__.__name__}()`. "
- "Failing to do so can result in silent errors that might be hard to debug."
- )
-
- if isinstance(audio, torch.Tensor) and len(audio.shape) == 4:
- logger.warning_once(
- "`audio` is a 4-dimensional torch tensor and has thus been recognized as the output of `Demucs`. "
- "If this is not the case, make sure to read Musicgen Melody docstrings and "
- "to correct `audio` to get the right behaviour."
- "Link to the docstrings: https://huggingface.co/docs/transformers/main/en/model_doc/musicgen_melody"
- )
- audio = self._extract_stem_indices(audio, sampling_rate=sampling_rate)
- elif sampling_rate is not None and sampling_rate != self.sampling_rate:
- audio = torchaudio.functional.resample(
- audio, sampling_rate, self.sampling_rate, rolloff=0.945, lowpass_filter_width=24
- )
-
- is_batched = isinstance(audio, (np.ndarray, torch.Tensor)) and len(audio.shape) > 1
- is_batched = is_batched or (
- isinstance(audio, (list, tuple)) and (isinstance(audio[0], (torch.Tensor, np.ndarray, tuple, list)))
- )
-
- if is_batched and not isinstance(audio[0], torch.Tensor):
- audio = [torch.tensor(speech, dtype=torch.float32).unsqueeze(-1) for speech in audio]
- elif is_batched:
- audio = [speech.unsqueeze(-1) for speech in audio]
- elif not is_batched and not isinstance(audio, torch.Tensor):
- audio = torch.tensor(audio, dtype=torch.float32).unsqueeze(-1)
-
- if isinstance(audio[0], torch.Tensor) and audio[0].dtype is torch.float64:
- audio = [speech.to(torch.float32) for speech in audio]
-
- # always return batch
- if not is_batched:
- audio = [audio]
-
- if len(audio[0].shape) == 3:
- logger.warning_once(
- "`audio` has been detected as a batch of stereo signals. Will be convert to mono signals. "
- "If this is an undesired behaviour, make sure to read Musicgen Melody docstrings and "
- "to correct `audio` to get the right behaviour."
- "Link to the docstrings: https://huggingface.co/docs/transformers/main/en/model_doc/musicgen_melody"
- )
- # convert to mono-channel waveform
- audio = [stereo.mean(dim=0) for stereo in audio]
-
- batched_speech = BatchFeature({"input_features": audio})
-
- padded_inputs = self.pad(
- batched_speech,
- padding=padding,
- max_length=max_length if max_length else self.n_samples,
- truncation=truncation,
- pad_to_multiple_of=pad_to_multiple_of,
- return_attention_mask=return_attention_mask,
- return_tensors="pt",
- )
-
- input_features = self._torch_extract_fbank_features(padded_inputs["input_features"].squeeze(-1))
-
- padded_inputs["input_features"] = input_features
-
- if return_attention_mask:
- # rescale from raw audio length to spectrogram length
- padded_inputs["attention_mask"] = padded_inputs["attention_mask"][:, :: self.hop_length]
-
- if return_tensors is not None:
- padded_inputs = padded_inputs.convert_to_tensors(return_tensors)
-
- return padded_inputs
-
- def to_dict(self) -> dict[str, Any]:
- """
- Serializes this instance to a Python dictionary. Returns:
- `dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance.
- """
- output = copy.deepcopy(self.__dict__)
- output["feature_extractor_type"] = self.__class__.__name__
- if "mel_filters" in output:
- del output["mel_filters"]
- if "window" in output:
- del output["window"]
- if "chroma_filters" in output:
- del output["chroma_filters"]
- if "spectrogram" in output:
- del output["spectrogram"]
- return output
+MusicgenMelodyFeatureExtractor = deprecated_feature_extractor(
+ MusicgenMelodyAudioProcessor, "MusicgenMelodyFeatureExtractor"
+)
__all__ = ["MusicgenMelodyFeatureExtractor"]
diff --git a/src/transformers/models/parakeet/audio_processing_parakeet.py b/src/transformers/models/parakeet/audio_processing_parakeet.py
new file mode 100644
index 000000000000..5df813fabae5
--- /dev/null
+++ b/src/transformers/models/parakeet/audio_processing_parakeet.py
@@ -0,0 +1,158 @@
+# Copyright 2025 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ...audio_processing_backends import TorchAudioBackend
+from ...audio_utils import MelScaleConfig, SpectrogramConfig, StftConfig
+
+
+class ParakeetAudioProcessor(TorchAudioBackend):
+ sample_rate = 16000
+ force_mono = True
+ spectrogram_config = SpectrogramConfig(
+ stft_config=StftConfig(
+ n_fft=512,
+ hop_length=160,
+ win_length=400,
+ window_fn="hann_window",
+ power=2.0,
+ pad_mode="constant",
+ periodic=False,
+ ),
+ mel_scale_config=MelScaleConfig(
+ n_mels=80,
+ f_min=0.0,
+ norm="slaney",
+ mel_scale="slaney",
+ ),
+ preemphasis=0.97,
+ log_mode="log",
+ mel_floor=2**-24,
+ )
+
+ def _mel_filter_bank(self, spectrogram_config):
+ """Compute mel filters via numpy for exact numerical match with the feature extractor.
+
+ The FE uses librosa which accumulates into a float32 array per-band.
+ Replicating that truncation pattern is needed for bit-exact results.
+ """
+ import numpy as np
+ import torch
+
+ from ...audio_utils import hertz_to_mel, mel_to_hertz
+
+ stft_cfg = spectrogram_config.stft_config
+ mel_cfg = spectrogram_config.mel_scale_config
+ n_fft = stft_cfg.n_fft
+ n_mels = mel_cfg.n_mels
+ f_min = mel_cfg.f_min
+ f_max = mel_cfg.f_max if mel_cfg.f_max is not None else self.sample_rate / 2
+
+ mel_min = hertz_to_mel(f_min, mel_scale=mel_cfg.mel_scale)
+ mel_max = hertz_to_mel(f_max, mel_scale=mel_cfg.mel_scale)
+ mel_pts = np.linspace(mel_min, mel_max, n_mels + 2)
+ filter_freqs = mel_to_hertz(mel_pts.copy(), mel_scale=mel_cfg.mel_scale)
+ fft_freqs = np.linspace(0, self.sample_rate / 2, 1 + n_fft // 2)
+
+ fdiff = np.diff(filter_freqs)
+ ramps = np.subtract.outer(filter_freqs, fft_freqs)
+
+ # Accumulate into f32 per-band to match librosa's truncation pattern
+ weights = np.zeros((n_mels, 1 + n_fft // 2), dtype=np.float32)
+ for i in range(n_mels):
+ lower = -ramps[i] / fdiff[i]
+ upper = ramps[i + 2] / fdiff[i + 1]
+ weights[i] = np.maximum(0, np.minimum(lower, upper))
+
+ if mel_cfg.norm == "slaney":
+ enorm = 2.0 / (filter_freqs[2 : n_mels + 2] - filter_freqs[:n_mels])
+ weights *= enorm[:, np.newaxis]
+
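+        # weights is (n_mels, 1 + n_fft // 2) = (80, 257) here; the transpose below
+        # stores the filters as (freq_bins, n_mels), which _apply_mel_scale undoes.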
+ return torch.from_numpy(weights.T).to(torch.float32)
+
+ def _compute_magnitudes(self, stft_out, power, spectrogram_config=None):
+ import torch
+
+ magnitudes = torch.view_as_real(stft_out)
+ magnitudes = torch.sqrt(magnitudes.pow(2).sum(-1))
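+        # mathematically equal to torch.abs(stft_out); the decomposed form mirrors
+        # the original extractor so the numerics match exactly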
+ if power != 1.0:
+ magnitudes = magnitudes.pow(power)
+ return magnitudes
+
+ def _needs_manual_framing(self, spectrogram_config):
+        # Preemphasis is applied at the waveform level in _stft, so no per-frame handling is needed here.
+ return spectrogram_config.remove_dc_offset or spectrogram_config.stft_config.left_align_fft
+
+ def _stft(self, audio, *, spectrogram_config, audio_ranges=None, **kwargs):
+ import torch
+
+        if audio_ranges is not None:
+            audio_lengths = torch.tensor(
+                [end - start for start, end in audio_ranges], device=audio.device
+            )
+        else:
+            audio_lengths = None
+
+ # Waveform-level preemphasis with masking to zero out padding
+ preemphasis = spectrogram_config.preemphasis
+ if preemphasis is not None:
+ audio = torch.cat(
+ [audio[:, :1], audio[:, 1:] - preemphasis * audio[:, :-1]], dim=1
+ )
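+            # i.e. y[0] = x[0], y[n] = x[n] - preemphasis * x[n - 1] for n >= 1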
+ if audio_lengths is not None:
+ timemask = torch.arange(audio.shape[-1], device=audio.device).unsqueeze(0) < audio_lengths.unsqueeze(1)
+ audio = audio.masked_fill(~timemask, 0.0)
+
+ return super()._stft(audio, spectrogram_config=spectrogram_config, **kwargs)
+
+ def _apply_mel_scale(self, features, *, spectrogram_config, **kwargs):
+ import torch
+
+ return torch.matmul(self.mel_filters.T, features)
+
+ def _normalize_magnitude(self, features, *, spectrogram_config, audio_ranges=None, **kwargs):
+ import torch
+
+        # Match the original extractor: log(mel_spec + mel_floor) instead of log(clamp(mel_spec, min=mel_floor))
+ features = torch.log(features + spectrogram_config.mel_floor)
+
+ # (batch, mels, frames) -> (batch, frames, mels)
+ features = features.permute(0, 2, 1)
+
+ # Per-utterance normalization
+ if audio_ranges is not None:
+ stft_cfg = spectrogram_config.stft_config
+ audio_lengths = torch.tensor([end - start for start, end in audio_ranges])
+ features_lengths = torch.floor_divide(
+ audio_lengths + stft_cfg.n_fft // 2 * 2 - stft_cfg.n_fft, stft_cfg.hop_length
+ )
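+            # with even n_fft the offset terms cancel, leaving audio_lengths // hop_length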
+ attention_mask = torch.arange(features.shape[1])[None, :] < features_lengths[:, None]
+ mask = attention_mask.unsqueeze(-1)
+ mel_masked = features * mask
+ mean = mel_masked.sum(dim=1) / features_lengths.unsqueeze(-1)
+ mean = mean.unsqueeze(1)
+ variance = ((mel_masked - mean) ** 2 * mask).sum(dim=1) / (features_lengths - 1).unsqueeze(-1)
+ std = torch.sqrt(variance).unsqueeze(1)
+ features = (features - mean) / (std + 1e-5)
+ features *= mask
+
+ return features
+
+
+__all__ = ["ParakeetAudioProcessor"]
diff --git a/src/transformers/models/parakeet/feature_extraction_parakeet.py b/src/transformers/models/parakeet/feature_extraction_parakeet.py
index c745d02c9629..92f02cd0a9f4 100644
--- a/src/transformers/models/parakeet/feature_extraction_parakeet.py
+++ b/src/transformers/models/parakeet/feature_extraction_parakeet.py
@@ -11,275 +11,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+from ...utils.deprecation import deprecated_feature_extractor
+from .audio_processing_parakeet import ParakeetAudioProcessor
-import numpy as np
-import torch
-
-from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
-from ...feature_extraction_utils import BatchFeature
-from ...utils import TensorType, is_librosa_available, logging
-from ...utils.import_utils import requires
-
-
-if is_librosa_available():
- import librosa
-
-
-EPSILON = 1e-5
-LOG_ZERO_GUARD_VALUE = 2**-24
-
-
-logger = logging.get_logger(__name__)
-
-
-@requires(backends=("torch", "librosa"))
-class ParakeetFeatureExtractor(SequenceFeatureExtractor):
- r"""
- Constructs a Parakeet feature extractor.
-
- This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
- most of the main methods. Users should refer to this superclass for more information regarding those methods.
-
- This class extracts mel-filter bank features from raw speech using a custom numpy implementation of the `Short Time
- Fourier Transform` which should match pytorch's `torch.stft` equivalent.
-
- Args:
- feature_size (`int`, *optional*, defaults to 80):
- The feature dimension of the extracted features.
- sampling_rate (`int`, *optional*, defaults to 16000):
- The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
- hop_length (`int`, *optional*, defaults to 160):
- Length of the overlapping windows for the STFT used to obtain the Mel Frequency coefficients.
- n_fft (`int`, *optional*, defaults to 512):
- Size of the Fourier transform.
- win_length (`int`, *optional*, defaults to 400):
- The window length for the STFT computation.
- preemphasis (`float`, *optional*, defaults to 0.97):
- A preemphasis filter coefficient. 0.0 means no preemphasis filter.
- padding_value (`float`, *optional*, defaults to 0.0):
- Padding value used to pad the audio. Should correspond to silences.
- """
-
- model_input_names = ["input_features", "attention_mask"]
-
- def __init__(
- self,
- feature_size=80,
- sampling_rate=16000,
- hop_length=160,
- n_fft=512,
- win_length=400,
- preemphasis=0.97,
- padding_value=0.0,
- **kwargs,
- ):
- super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs)
-
- self.hop_length = hop_length
- self.n_fft = n_fft
- self.win_length = win_length
- self.preemphasis = preemphasis
-
- # TODO: @eustlb, for now we use librosa to compute the mel filters
- # indeed mel_filter_bank uses np.float64 (while librosa uses np.float32), giving numerical differences
- # self.mel_filters = mel_filter_bank(
- # num_frequency_bins=n_fft // 2 + 1,
- # num_mel_filters=feature_size,
- # min_frequency=0.0,
- # max_frequency=sampling_rate / 2,
- # sampling_rate=sampling_rate,
- # norm="slaney",
- # mel_scale="slaney",
- # )
- mel_filters = librosa.filters.mel(
- sr=sampling_rate, n_fft=n_fft, n_mels=feature_size, fmin=0.0, fmax=sampling_rate / 2, norm="slaney"
- )
- self.mel_filters = torch.from_numpy(mel_filters).to(torch.float32)
-
- def _torch_extract_fbank_features(self, waveform, device="cpu"):
- # spectrogram
- window = torch.hann_window(self.win_length, periodic=False, device=device)
- stft = torch.stft(
- waveform,
- self.n_fft,
- hop_length=self.hop_length,
- win_length=self.win_length,
- window=window,
- return_complex=True,
- pad_mode="constant",
- )
- # Let's math original implementation
- # magnitudes = torch.abs(stft) ** 2
- magnitudes = torch.view_as_real(stft)
- magnitudes = torch.sqrt(magnitudes.pow(2).sum(-1))
- magnitudes = magnitudes.pow(2)
-
- # log mel spectrogram
- mel_filters = self.mel_filters.to(device)
- mel_spec = mel_filters @ magnitudes
- mel_spec = torch.log(mel_spec + LOG_ZERO_GUARD_VALUE)
-
- # (batch_size, num_mel_filters, num_frames) -> (batch_size, num_frames, num_mel_filters)
- mel_spec = mel_spec.permute(0, 2, 1)
-
- return mel_spec
-
- def __call__(
- self,
- raw_speech: np.ndarray | list[float] | list[np.ndarray] | list[list[float]],
- truncation: bool = False,
- pad_to_multiple_of: int | None = None,
- return_tensors: str | TensorType | None = None,
- return_attention_mask: bool | None = None,
- padding: str | None = "longest",
- max_length: int | None = None,
- sampling_rate: int | None = None,
- do_normalize: bool | None = None,
- device: str | None = "cpu",
- return_token_timestamps: bool | None = None,
- **kwargs,
- ) -> BatchFeature:
- """
- Main method to featurize and prepare for the model one or several sequence(s). Implementation uses PyTorch for
- the STFT computation if available, otherwise a slower NumPy based one.
-
- Args:
- raw_speech (`np.ndarray`, `list[float]`, `list[np.ndarray]`, `list[list[float]]`):
- The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
- values, a list of numpy arrays or a list of list of float values. Must be mono channel audio, not
- stereo, i.e. single float per timestep.
- truncation (`bool`, *optional*, default to `True`):
- Activates truncation to cut input sequences longer than *max_length* to *max_length*.
- pad_to_multiple_of (`int`, *optional*, defaults to None):
- If set will pad the sequence to a multiple of the provided value.
-
- This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
- `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
- return_attention_mask (`bool`, *optional*):
- Whether to return the attention mask. If left to the default, will return the attention mask according
- to the specific feature_extractor's default.
-
- [What are attention masks?](../glossary#attention-mask)
-
-
-
- For Parakeet models, `attention_mask` should always be passed for batched inference, to avoid subtle
- bugs.
-
-
-
- return_tensors (`str` or [`~utils.TensorType`], *optional*):
- If set, will return tensors instead of list of python integers. Acceptable values are:
-
- - `'tf'`: Return TensorFlow `tf.constant` objects.
- - `'pt'`: Return PyTorch `torch.Tensor` objects.
- - `'np'`: Return Numpy `np.ndarray` objects.
- sampling_rate (`int`, *optional*):
- The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass
- `sampling_rate` at the forward call to prevent silent errors and allow automatic speech recognition
- pipeline.
- padding_value (`float`, *optional*, defaults to 0.0):
- The value that is used to fill the padding values / vectors.
- do_normalize (`bool`, *optional*, defaults to `False`):
- Whether or not to zero-mean unit-variance normalize the input. Normalizing can help to significantly
- improve the performance of the model.
- device (`str`, *optional*, defaults to `'cpu'`):
- Specifies the device for computation of the log-mel spectrogram of audio signals in the
- `_torch_extract_fbank_features` method. (e.g., "cpu", "cuda")
- return_token_timestamps (`bool`, *optional*, defaults to `None`):
- Deprecated. Use `return_attention_mask` instead from which the number of frames can be inferred.
-
- Whether or not to return the number of frames of the input raw_speech.
- These num_frames can be used by the model to compute word level timestamps.
- """
- if sampling_rate is not None:
- if sampling_rate != self.sampling_rate:
- raise ValueError(
- f"The model corresponding to this feature extractor: {self.__class__.__name__} was trained using a"
- f" sampling rate of {self.sampling_rate}. Please make sure that the provided `raw_speech` input"
- f" was sampled with {self.sampling_rate} and not {sampling_rate}."
- )
- else:
- logger.warning(
- f"It is strongly recommended to pass the `sampling_rate` argument to `{self.__class__.__name__}()`. "
- "Failing to do so can result in silent errors that might be hard to debug."
- )
-
- # Convert to torch tensor
- if isinstance(raw_speech, np.ndarray):
- raw_speech = torch.tensor(raw_speech)
- elif isinstance(raw_speech, (list, tuple)) and isinstance(raw_speech[0], np.ndarray):
- raw_speech = [torch.tensor(speech) for speech in raw_speech]
-
- is_batched_torch = isinstance(raw_speech, torch.Tensor) and len(raw_speech.shape) > 1
- if is_batched_torch and len(raw_speech.shape) > 2:
- logger.warning(
- f"Only mono-channel audio is supported for input to {self.__class__.__name__}. "
- "We will take the mean of the channels to convert to mono."
- )
- raw_speech = raw_speech.mean(-1)
-
- is_batched_sequence = isinstance(raw_speech, (list, tuple))
- if is_batched_sequence:
- for speech in raw_speech:
- if len(speech.shape) > 1:
- logger.warning(
- f"Only mono-channel audio is supported for input to {self.__class__.__name__}. "
- "We will take the mean of the channels to convert to mono."
- )
- speech = speech.mean(-1)
-
- if is_batched_torch or is_batched_sequence:
- raw_speech = [speech[:, None].to(torch.float32) for speech in raw_speech]
- else:
- raw_speech = [raw_speech[:, None].to(torch.float32)]
-
- audio_lengths = [len(speech) for speech in raw_speech]
- batched_speech = BatchFeature({"input_features": raw_speech, "audio_lengths": audio_lengths})
-
- padded_inputs = self.pad(
- batched_speech,
- padding=padding,
- max_length=max_length,
- truncation=truncation,
- pad_to_multiple_of=pad_to_multiple_of,
- return_tensors="pt",
- )
- input_features = padded_inputs.input_features.squeeze(-1)
-
- # preemphasis
- if self.preemphasis is not None:
- timemask = torch.arange(input_features.shape[1], device=input_features.device).unsqueeze(
- 0
- ) < padded_inputs.audio_lengths.unsqueeze(1)
- input_features = torch.cat(
- [input_features[:, :1], input_features[:, 1:] - self.preemphasis * input_features[:, :-1]], dim=1
- )
- input_features = input_features.masked_fill(~timemask, 0.0)
-
- input_features = self._torch_extract_fbank_features(input_features, device)
- features_lengths = torch.floor_divide(
- padded_inputs.audio_lengths + self.n_fft // 2 * 2 - self.n_fft, self.hop_length
- )
- attention_mask = torch.arange(input_features.shape[1], device=device)[None, :] < features_lengths[:, None]
-
- # normalize mel features, ignoring padding
- mask = attention_mask.unsqueeze(-1)
- input_features_masked = input_features * mask
- mean = input_features_masked.sum(dim=1) / features_lengths.unsqueeze(-1)
- mean = mean.unsqueeze(1)
- variance = ((input_features_masked - mean) ** 2 * mask).sum(dim=1) / (features_lengths - 1).unsqueeze(-1)
- std = torch.sqrt(variance).unsqueeze(1)
- input_features = (input_features - mean) / (std + EPSILON)
- input_features *= mask
-
- return BatchFeature(
- data={
- "input_features": input_features,
- "attention_mask": attention_mask,
- },
- tensor_type=return_tensors,
- )
+ParakeetFeatureExtractor = deprecated_feature_extractor(ParakeetAudioProcessor, "ParakeetFeatureExtractor")
__all__ = ["ParakeetFeatureExtractor"]
diff --git a/src/transformers/models/pe_audio/audio_processing_pe_audio.py b/src/transformers/models/pe_audio/audio_processing_pe_audio.py
new file mode 100644
index 000000000000..1c8969b28ed2
--- /dev/null
+++ b/src/transformers/models/pe_audio/audio_processing_pe_audio.py
@@ -0,0 +1,25 @@
+# Copyright 2025 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ...audio_processing_backends import NumpyAudioBackend
+
+
+class PeAudioAudioProcessor(NumpyAudioBackend):
+    sample_rate = 48000  # the deprecated PeAudioFeatureExtractor defaulted to 48 kHz
+ force_mono = True
+
+
+__all__ = ["PeAudioAudioProcessor"]
diff --git a/src/transformers/models/pe_audio/feature_extraction_pe_audio.py b/src/transformers/models/pe_audio/feature_extraction_pe_audio.py
index a7738d3089ac..da1f7d34a86f 100644
--- a/src/transformers/models/pe_audio/feature_extraction_pe_audio.py
+++ b/src/transformers/models/pe_audio/feature_extraction_pe_audio.py
@@ -11,150 +11,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+from ...utils.deprecation import deprecated_feature_extractor
+from .audio_processing_pe_audio import PeAudioAudioProcessor
-import numpy as np
-
-from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
-from ...feature_extraction_utils import BatchFeature
-from ...processing_utils import load_audio
-from ...utils import PaddingStrategy, TensorType, logging
-
-
-logger = logging.get_logger(__name__)
-
-
-class PeAudioFeatureExtractor(SequenceFeatureExtractor):
- r"""
- Constructs a PeAudioFeatureExtractor feature extractor.
-
- This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
- most of the main methods. Users should refer to this superclass for more information regarding those methods.
-
- Args:
- feature_size (`int`, *optional*, defaults to 1):
- The feature dimension of the extracted features. Use 1 for mono, 2 for stereo.
- sampling_rate (`int`, *optional*, defaults to 48000):
- The sampling rate at which the audio waveform should be digitalized, expressed in hertz (Hz).
- padding_value (`float`, *optional*, defaults to 0.0):
- The value that is used for padding.
- hop_length (`int`, *optional*, defaults to 1920):
-            Step size between successive windows, in samples.
- """
-
- model_input_names = ["input_values"]
-
- def __init__(
- self,
- feature_size: int = 1,
- sampling_rate: int = 48_000,
- padding_value: float = 0.0,
- hop_length: int = 1920,
- **kwargs,
- ):
- super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs)
- self.hop_length = hop_length
-
- def _reflect_pad(self, wav):
- if len(wav) % self.hop_length == 0:
- return wav
- p1d = (0, self.hop_length - (len(wav) % self.hop_length))
- return np.pad(wav, p1d, "reflect")
-
- def __call__(
- self,
- raw_audio: np.ndarray | list[float] | list[np.ndarray] | list[list[float]] | str | list[str],
- padding: bool | str | PaddingStrategy | None = None,
- truncation: bool | None = False,
- max_length: int | None = None,
- return_tensors: str | TensorType | None = None,
- sampling_rate: int | None = None,
- ) -> BatchFeature:
- from_file = False
- if isinstance(raw_audio, str):
- raw_audio = [raw_audio]
-
- if isinstance(raw_audio, (list, tuple)) and isinstance(raw_audio[0], str):
- loaded = []
- for audio_file in raw_audio:
- loaded.append(load_audio(audio_file, self.sampling_rate))
- raw_audio = loaded
- from_file = True
-
- if sampling_rate is not None:
- if sampling_rate != self.sampling_rate:
- raise ValueError(
- f"The model corresponding to this feature extractor: {self} was trained using a sampling rate of"
- f" {self.sampling_rate}. Please make sure that the provided audio input was sampled with"
- f" {self.sampling_rate} and not {sampling_rate}."
- )
- elif not from_file:
- logger.warning(
- f"It is strongly recommended to pass the `sampling_rate` argument to `{self.__class__.__name__}()`. "
- "Failing to do so can result in silent errors that might be hard to debug."
- )
-
- if padding and truncation:
- raise ValueError("Both padding and truncation were set. Make sure you only set one.")
- elif padding is None:
- # by default let's pad the inputs
- padding = True
-
- is_batched = bool(
- isinstance(raw_audio, (list, tuple)) and (isinstance(raw_audio[0], (np.ndarray, tuple, list)))
- )
-
- if is_batched:
- raw_audio = [np.asarray(audio, dtype=np.float32).T for audio in raw_audio]
- elif not is_batched and not isinstance(raw_audio, np.ndarray):
- raw_audio = np.asarray(raw_audio, dtype=np.float32)
- elif isinstance(raw_audio, np.ndarray) and raw_audio.dtype is np.dtype(np.float64):
- raw_audio = raw_audio.astype(np.float32)
-
- # always return batch
- if not is_batched:
- raw_audio = [np.asarray(raw_audio).T]
-
- if isinstance(raw_audio, list):
- raw_audio = [self._reflect_pad(x) for x in raw_audio]
- else:
- raw_audio = self._reflect_pad(raw_audio)
-
- # verify inputs are valid
- for example in raw_audio:
- if example.ndim > 2:
- raise ValueError(f"Expected input shape (channels, length) but got shape {example.shape}")
- if self.feature_size == 1 and example.ndim != 1:
- raise ValueError(f"Expected mono audio but example has {example.shape[-1]} channels")
- if self.feature_size == 2:
- raise ValueError("Stereo audio isn't supported for now")
-
- input_values = BatchFeature({"input_values": raw_audio})
-
- # normal padding on batch
- padded_inputs = self.pad(
- input_values,
- max_length=max_length,
- truncation=truncation,
- padding=padding,
- return_attention_mask=padding,
- pad_to_multiple_of=self.hop_length,
- )
- if padding:
- padded_inputs["padding_mask"] = padded_inputs.pop("attention_mask")
- if padding:
- padded_inputs.input_values = padded_inputs.input_values[:, np.newaxis, :]
-
- input_values = []
- for example in padded_inputs.pop("input_values"):
- if self.feature_size == 1:
- example = example[..., None]
- input_values.append(example.T)
-
- padded_inputs["input_values"] = input_values
- if return_tensors is not None:
- padded_inputs = padded_inputs.convert_to_tensors(return_tensors)
-
- return padded_inputs
+PeAudioFeatureExtractor = deprecated_feature_extractor(PeAudioAudioProcessor, "PeAudioFeatureExtractor")
__all__ = ["PeAudioFeatureExtractor"]
diff --git a/src/transformers/models/phi4_multimodal/audio_processing_phi4_multimodal.py b/src/transformers/models/phi4_multimodal/audio_processing_phi4_multimodal.py
new file mode 100644
index 000000000000..a63321c9a346
--- /dev/null
+++ b/src/transformers/models/phi4_multimodal/audio_processing_phi4_multimodal.py
@@ -0,0 +1,127 @@
+# Copyright 2025 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+
+from ...audio_processing_backends import TorchAudioBackend
+from ...audio_utils import MelScaleConfig, SpectrogramConfig, StftConfig, mel_filter_bank
+
+
+class Phi4MultimodalAudioProcessor(TorchAudioBackend):
+ sample_rate = 16000
+ force_mono = True
+ audio_compression_rate = 8
+ audio_downsample_rate = 1
+ audio_feat_stride = 1
+ spectrogram_config = SpectrogramConfig(
+ stft_config=StftConfig(
+ n_fft=512,
+ win_length=400,
+ hop_length=160,
+ window_fn="hamming_window",
+ periodic=False,
+ center=False,
+ power=2.0,
+ window_dtype="float64",
+ ),
+ preemphasis=0.97,
+ mel_scale_config=MelScaleConfig(
+ n_mels=80,
+ f_min=0,
+ f_max=7690,
+ mel_scale="kaldi",
+ triangularize_in_mel_space=True,
+ matmul_order="features_first",
+ ),
+ mel_floor=1.0,
+ log_mode="log",
+ )
+
+ def _mel_filter_bank(self, spectrogram_config):
+ stft_cfg = spectrogram_config.stft_config
+ mel_cfg = spectrogram_config.mel_scale_config
+ mel_filters_np = mel_filter_bank(
+ num_frequency_bins=1 + stft_cfg.n_fft // 2,
+ num_mel_filters=mel_cfg.n_mels,
+ min_frequency=mel_cfg.f_min,
+ max_frequency=mel_cfg.f_max if mel_cfg.f_max is not None else self.sample_rate / 2,
+ sampling_rate=self.sample_rate,
+ norm=mel_cfg.norm,
+ mel_scale=mel_cfg.mel_scale,
+ triangularize_in_mel_space=mel_cfg.triangularize_in_mel_space,
+ )
+ return torch.from_numpy(mel_filters_np).to(torch.float32)
+
+ def _apply_frame_processing(self, frames, *, spectrogram_config, audio_ranges=None, **kwargs):
+ # Mask frames that overlap the boundary between real audio and padding
+ stft_cfg = spectrogram_config.stft_config
+ win_length = stft_cfg.win_length or stft_cfg.n_fft
+ hop_length = stft_cfg.hop_length or win_length // 2
+ batch_size = frames.shape[0]
+
+ if audio_ranges is not None and batch_size > 1:
+            # keep the masking tensors on the same device as the frames
+            device = frames.device
+            audio_lengths_t = torch.tensor([end - start for start, end in audio_ranges], device=device)
+            to_mask_idxs = torch.arange(batch_size, device=device)[audio_lengths_t != audio_lengths_t.max()]
+ if to_mask_idxs.numel() > 0:
+ frames = frames.clone()
+ down = (audio_lengths_t[to_mask_idxs] - win_length) // hop_length + 1
+ up = audio_lengths_t[to_mask_idxs] // hop_length - 1
+ offset = down.min()
+ max_idx = up.max()
+
+                mask_range = torch.arange(max_idx - offset, device=device).expand(to_mask_idxs.shape[0], -1)
+ mask = ((down - offset).unsqueeze(1) <= mask_range) & (mask_range < (up - offset).unsqueeze(1))
+ mask = mask.unsqueeze(-1).expand(-1, -1, win_length)
+
+ masked_frames = frames[to_mask_idxs, offset:max_idx].masked_fill_(mask, 0)
+ frames[to_mask_idxs, offset:max_idx] = masked_frames
+
+ frames_prev = torch.roll(frames, 1, dims=-1)
+ frames_prev[..., 0] = frames_prev[..., 1]
+ return (frames - spectrogram_config.preemphasis * frames_prev) * 32768
+
+ def _window_and_fft(self, frames, window, frame_length, n_fft, stft_cfg, audio_dtype=None):
+ frames = frames * window
+ if frame_length < n_fft:
+ frames = torch.nn.functional.pad(frames, (0, n_fft - frame_length))
+ # Cast to complex64 before abs() to match the FE's precision path
+ spec = torch.fft.rfft(frames, n=n_fft).to(torch.complex64)
+ if stft_cfg.normalized:
+ spec = spec / window.pow(2.0).sum().sqrt()
+ return spec.transpose(-2, -1)
+
+ def _get_features_lengths(self, audio_lengths, spectrogram_config, include_center_frame=False):
+ win_length = spectrogram_config.stft_config.win_length or spectrogram_config.stft_config.n_fft
+ hop_length = spectrogram_config.stft_config.hop_length or win_length // 2
+ return (audio_lengths - win_length) // hop_length + 1
+
+ def _compute_audio_embed_size(self, audio_frames):
+ integer = audio_frames // self.audio_compression_rate
+ remainder = audio_frames % self.audio_compression_rate
+ result = integer + (remainder > 0).to(integer.dtype)
+
+ integer = result // self.audio_downsample_rate
+ remainder = result % self.audio_downsample_rate
+ result = integer + (remainder > 0).to(integer.dtype)
+
+ return result
+
+ def _postprocess_output(self, output, **kwargs):
+ feature_lengths = output["audio_features_mask"].sum(dim=-1)
+ feature_lengths = feature_lengths * self.audio_feat_stride
+ output["audio_embed_sizes"] = self._compute_audio_embed_size(feature_lengths)
+ return output
+
+
+__all__ = ["Phi4MultimodalAudioProcessor"]
diff --git a/src/transformers/models/phi4_multimodal/feature_extraction_phi4_multimodal.py b/src/transformers/models/phi4_multimodal/feature_extraction_phi4_multimodal.py
index 9ce98251e50e..78d4727cbccd 100644
--- a/src/transformers/models/phi4_multimodal/feature_extraction_phi4_multimodal.py
+++ b/src/transformers/models/phi4_multimodal/feature_extraction_phi4_multimodal.py
@@ -11,271 +11,12 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+from ...utils.deprecation import deprecated_feature_extractor
+from .audio_processing_phi4_multimodal import Phi4MultimodalAudioProcessor
-"""
-Processor class for Phi4Multimodal
-"""
-
-import numpy as np
-
-from ...audio_utils import AudioInput, mel_filter_bank
-from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
-from ...image_processing_utils import BatchFeature
-from ...utils import TensorType, is_torch_available, logging
-
-
-if is_torch_available():
- import torch
-
-
-logger = logging.get_logger(__name__)
-
-
-class Phi4MultimodalFeatureExtractor(SequenceFeatureExtractor):
- model_input_names = ["audio_input_features", "audio_embed_sizes", "audio_attention_mask"]
-
- def __init__(
- self,
- feature_size: int = 80,
- sampling_rate: int = 16000,
- hop_length: int = 160,
- n_fft: int = 512,
- win_length: int = 400,
- preemphasis: float = 0.97,
- padding_value: float = 0.0,
- audio_compression_rate: int = 8,
- audio_downsample_rate: int = 1,
- audio_feat_stride: int = 1,
- mel_min_frequency: float = 0,
- mel_max_frequency: float = 7690,
- **kwargs,
- ):
- super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs)
-
- self.hop_length = hop_length
- self.n_fft = n_fft
- self.win_length = win_length
- self.preemphasis = preemphasis
- self.padding_value = padding_value
- self.audio_compression_rate = audio_compression_rate
- self.audio_downsample_rate = audio_downsample_rate
- self.audio_feat_stride = audio_feat_stride
-
- self.mel_filters = mel_filter_bank(
- num_frequency_bins=self.n_fft // 2 + 1,
- num_mel_filters=self.feature_size,
- min_frequency=mel_min_frequency,
- max_frequency=mel_max_frequency,
- sampling_rate=self.sampling_rate,
- triangularize_in_mel_space=True,
- mel_scale="kaldi",
- )
-
- def __call__(
- self,
- raw_speech: AudioInput,
- sampling_rate: int | None = None,
- pad_to_multiple_of: int | None = None,
- padding: str | None = "longest",
- max_length: int | None = None,
- truncation: bool = False,
- return_tensors: str | TensorType | None = None,
- return_attention_mask: bool | None = True,
- device: str | None = "cpu",
- **kwargs,
- ) -> BatchFeature:
- """
- Main method to featurize and prepare for the model one or several audio sequence(s). Implementation uses PyTorch for
- the STFT computation if available, otherwise a slower NumPy based one.
-
- Args:
- raw_speech (`np.ndarray`, `torch.Tensor`, `list[np.ndarray]`, `list[torch.Tensor]`):
- The sequence or batch of sequences to be processed. Each sequence can be a numpy array or PyTorch tensor.
- For batched inputs, sequences can be a list of numpy arrays or PyTorch tensors, or a single numpy array or
- PyTorch tensor with first dimension being the batch size.
- sampling_rate (`int`, *optional*):
- The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass
- `sampling_rate` at the forward call to prevent silent errors.
- pad_to_multiple_of (`int`, *optional*, defaults to None):
- If set will pad the sequence to a multiple of the provided value.
- padding (`str`, *optional*, defaults to "longest"):
- Padding strategy. Can be "longest" to pad to the longest sequence in the batch, or a specific length.
- max_length (`int`, *optional*):
- Maximum length of the returned list and optionally padding length.
- truncation (`bool`, *optional*, defaults to False):
- Activates truncation to cut input sequences longer than *max_length* to *max_length*.
- return_tensors (`str` or [`~utils.TensorType`], *optional*):
- If set, will return tensors instead of numpy arrays. Acceptable values are:
- - `'pt'`: Return PyTorch `torch.Tensor` objects.
- - `'np'`: Return Numpy `np.ndarray` objects.
- return_attention_mask (`bool`, *optional*, defaults to `True`):
- Whether to return the extracted audio input features' attention mask.
- device (`str`, *optional*, defaults to "cpu"):
- Specifies the device for computation of the audio features. (e.g., "cpu", "cuda")
-
- Returns:
- [`BatchFeature`]: A [`BatchFeature`] with the following fields:
- - **audio_input_features** -- Audio features extracted from the raw audio input, shape (batch_size, max_feature_length, feature_size).
- - **audio_lengths** -- Length of each audio sample in the batch, shape (batch_size,).
- - **audio_attention_mask** -- Attention mask for the audio input, shape (batch_size, max_feature_length).
- If `return_tensors` is not specified, the fields will be PyTorch tensors if PyTorch is available, otherwise NumPy arrays.
- """
- if sampling_rate is not None:
- if sampling_rate != self.sampling_rate:
- raise ValueError(
- f"The model corresponding to this feature extractor: {self.__class__.__name__} was trained using a"
- f" sampling rate of {self.sampling_rate}. Please make sure that the provided `raw_speech` input"
- f" was sampled with {self.sampling_rate} and not {sampling_rate}."
- )
- else:
- logger.warning(
- f"It is strongly recommended to pass the `sampling_rate` argument to `{self.__class__.__name__}()`. "
- "Failing to do so can result in silent errors that might be hard to debug."
- )
-
- # Convert to torch tensor
- if isinstance(raw_speech, np.ndarray):
- raw_speech = torch.tensor(raw_speech)
- elif isinstance(raw_speech, (list, tuple)) and isinstance(raw_speech[0], np.ndarray):
- raw_speech = [torch.tensor(speech) for speech in raw_speech]
-
- is_batched_torch = isinstance(raw_speech, torch.Tensor) and len(raw_speech.shape) > 1
- if is_batched_torch and len(raw_speech.shape) > 2:
- logger.warning(
- f"Only mono-channel audio is supported for input to {self.__class__.__name__}. "
- "We will take the mean of the channels to convert to mono."
- )
- raw_speech = raw_speech.mean(-1)
-
- is_batched_sequence = isinstance(raw_speech, (list, tuple))
- if is_batched_sequence:
- for speech in raw_speech:
- if len(speech.shape) > 1:
- logger.warning(
- f"Only mono-channel audio is supported for input to {self.__class__.__name__}. "
- "We will take the mean of the channels to convert to mono."
- )
- speech = speech.mean(-1)
-
- if is_batched_torch or is_batched_sequence:
- raw_speech = [speech[:, None].to(torch.float32) for speech in raw_speech]
- else:
- raw_speech = [raw_speech[:, None].to(torch.float32)]
-
- audio_lengths = [len(speech) for speech in raw_speech]
-
- # convert into correct format for padding
- batched_speech = BatchFeature(data={"audio_input_features": raw_speech, "audio_lengths": audio_lengths})
- padded_inputs = self.pad(
- batched_speech,
- padding=padding,
- max_length=max_length,
- truncation=truncation,
- pad_to_multiple_of=pad_to_multiple_of,
- return_tensors="pt",
- )
- input_features = padded_inputs.audio_input_features.squeeze(-1)
- audio_lengths = padded_inputs.audio_lengths
-
- input_features = self._torch_extract_fbank_features(input_features, audio_lengths, device)
-
- feature_lengths = (audio_lengths - self.win_length) // self.hop_length + 1
- feature_lengths = feature_lengths * self.audio_feat_stride
- audio_embed_sizes = self._compute_audio_embed_size(feature_lengths)
-
- feature_attention_mask = (
- torch.arange(0, feature_lengths.max()) if is_torch_available() else np.arange(0, feature_lengths.max())
- )
- feature_attention_mask = (
- feature_attention_mask[None, :] < feature_lengths[:, None] if len(feature_lengths) > 1 else None
- )
-
- data = {
- "audio_input_features": input_features,
- "audio_embed_sizes": audio_embed_sizes,
- }
- if feature_attention_mask is not None and return_attention_mask:
- data["audio_attention_mask"] = feature_attention_mask
-
- return BatchFeature(data=data, tensor_type=return_tensors)
-
-    # TODO: @eustlb, move this to audio_utils in a general spectrogram_batch function that handles torch and numpy
- def _torch_extract_fbank_features(
- self, waveform: "torch.FloatTensor", audio_lengths: "torch.Tensor", device: str = "cpu"
- ) -> "torch.FloatTensor":
- """
- Compute the log mel-scaled spectrogram of batched waveforms using PyTorch's FFT implementation.
-
- Args:
-            waveform (`torch.FloatTensor` of shape `(batch_size, max_audio_length)`):
- The batched waveforms.
- audio_lengths (`torch.Tensor` of shape `(batch_size,)`):
- The lengths of the waveforms along the max_audio_length dimension.
- device (`str`, *optional*, defaults to "cpu"):
- The device to run the computation on. (e.g., "cpu", "cuda")
-
- Returns:
- `torch.FloatTensor` of shape `(batch_size, max_feature_length, feature_size)`:
- The log mel-scaled spectrogram of the batched waveforms.
- """
- fft_window = torch.hamming_window(self.win_length, periodic=False, device=device, dtype=torch.float64)
-
- # batched implementation
- batch_size = waveform.shape[0]
- frames = waveform.unfold(-1, self.win_length, self.hop_length)
-
- # ---
-        # the unbatched (and unpadded) original implementation skips the last few audio values that can't be included in a frame
- # we need to ensure that the corresponding frames for the padded input also mask these values
- if batch_size > 1:
- frames = frames.clone()
- # concerned batch indices
- to_mask_batch_idxs = torch.arange(batch_size)[audio_lengths != audio_lengths.max()]
- if to_mask_batch_idxs.numel() > 0:
- batch_idxs_down = (audio_lengths[to_mask_batch_idxs] - self.win_length) // self.hop_length + 1
- batch_idxs_up = (audio_lengths[to_mask_batch_idxs] // self.hop_length) - 1
- offset_idx = batch_idxs_down.min()
- max_idx = batch_idxs_up.max()
-
- mask = torch.arange(max_idx - offset_idx, device=device).expand(to_mask_batch_idxs.shape[0], -1)
- mask = ((batch_idxs_down - offset_idx).unsqueeze(1) <= mask) & (
- mask < (batch_idxs_up - offset_idx).unsqueeze(1)
- )
- mask = mask.unsqueeze(-1).expand(-1, -1, self.win_length)
- masked_frames = frames[to_mask_batch_idxs, offset_idx:max_idx].masked_fill_(mask, 0)
- frames[to_mask_batch_idxs, offset_idx:max_idx] = masked_frames
- # ---
-
- # apply pre-emphasis first order filter on fft windows
- frames_prev = torch.roll(frames, 1, dims=-1)
- frames_prev[:, :, 0] = frames_prev[:, :, 1]
- frames = (frames - self.preemphasis * frames_prev) * 32768
-
- # apply fft
- S = torch.fft.rfft(fft_window * frames.view(-1, self.win_length), n=self.n_fft, dim=1)
- S = S.view(frames.shape[0], -1, S.shape[-1])
- S = S.to(torch.complex64)
-
- spec = torch.abs(S)
- spec_power = spec**2
-
- # apply triangular mel filter bank
- mel_filters = torch.from_numpy(self.mel_filters).to(device, torch.float32)
- log_spec = torch.clamp(spec_power @ mel_filters, min=1.0)
- log_spec = torch.log(log_spec)
-
- return log_spec
-
- def _compute_audio_embed_size(self, audio_frames):
- integer = audio_frames // self.audio_compression_rate
- remainder = audio_frames % self.audio_compression_rate
- result = integer + (remainder > 0).to(integer.dtype)
-
- integer = result // self.audio_downsample_rate
- remainder = result % self.audio_downsample_rate
- result = integer + (remainder > 0).to(integer.dtype) # qformer compression
-
- return result
+Phi4MultimodalFeatureExtractor = deprecated_feature_extractor(
+ Phi4MultimodalAudioProcessor, "Phi4MultimodalFeatureExtractor"
+)
__all__ = ["Phi4MultimodalFeatureExtractor"]
diff --git a/src/transformers/models/pop2piano/audio_processing_pop2piano.py b/src/transformers/models/pop2piano/audio_processing_pop2piano.py
new file mode 100644
index 000000000000..9cd546b15a59
--- /dev/null
+++ b/src/transformers/models/pop2piano/audio_processing_pop2piano.py
@@ -0,0 +1,34 @@
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# NOTE: Full Pop2Piano feature extraction requires the Essentia library for
+# beat detection (RhythmExtractor2013) and scipy for beat interpolation.
+# This audio processor provides the basic mel spectrogram configuration but
+# does not implement the complete beat-aligned segmentation pipeline.
+
+from ...audio_processing_backends import NumpyAudioBackend
+from ...audio_utils import MelScaleConfig, SpectrogramConfig, StftConfig
+
+
+class Pop2PianoAudioProcessor(NumpyAudioBackend):
+ sample_rate = 22050
+ force_mono = True
+ spectrogram_config = SpectrogramConfig(
+ stft_config=StftConfig(n_fft=4096, hop_length=1024, power=2.0),
+ mel_scale_config=MelScaleConfig(n_mels=512, f_min=10.0, mel_scale="htk"),
+ log_mode="log10",
+ )
+
+
+__all__ = ["Pop2PianoAudioProcessor"]
diff --git a/src/transformers/models/pop2piano/feature_extraction_pop2piano.py b/src/transformers/models/pop2piano/feature_extraction_pop2piano.py
index 4e770fcb1b71..3ab91ec37d43 100644
--- a/src/transformers/models/pop2piano/feature_extraction_pop2piano.py
+++ b/src/transformers/models/pop2piano/feature_extraction_pop2piano.py
@@ -11,442 +11,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-"""Feature extractor class for Pop2Piano"""
+from ...utils.deprecation import deprecated_feature_extractor
+from .audio_processing_pop2piano import Pop2PianoAudioProcessor
-import warnings
-
-import numpy
-import numpy as np
-
-from ...audio_utils import mel_filter_bank, spectrogram
-from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
-from ...feature_extraction_utils import BatchFeature
-from ...utils import (
- TensorType,
- is_essentia_available,
- is_librosa_available,
- is_scipy_available,
- logging,
- requires_backends,
-)
-from ...utils.import_utils import requires
-
-
-if is_essentia_available():
- import essentia.standard
-
-if is_librosa_available():
- import librosa
-
-if is_scipy_available():
- import scipy
-
-
-logger = logging.get_logger(__name__)
-
-
-@requires(backends=("essentia", "librosa", "scipy", "torch"))
-class Pop2PianoFeatureExtractor(SequenceFeatureExtractor):
- r"""
- Constructs a Pop2Piano feature extractor.
-
- This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
- most of the main methods. Users should refer to this superclass for more information regarding those methods.
-
-    This class extracts rhythm and preprocesses the audio before it is passed to the model. First, the audio is passed
-    to the `RhythmExtractor2013` algorithm, which extracts the beat_times and beat positions and estimates their
-    confidence as well as the tempo in bpm; beat_times is then interpolated to get beatsteps. Later, we calculate
-    extrapolated_beatsteps from it to be used in the tokenizer. Separately, the audio is resampled to
-    self.sampling_rate and preprocessed, and a log-mel spectrogram is then computed from it for our transformer model.
-
- Args:
- sampling_rate (`int`, *optional*, defaults to 22050):
- Target Sampling rate of audio signal. It's the sampling rate that we forward to the model.
- padding_value (`int`, *optional*, defaults to 0):
- Padding value used to pad the audio. Should correspond to silences.
- window_size (`int`, *optional*, defaults to 4096):
- Length of the window in samples to which the Fourier transform is applied.
- hop_length (`int`, *optional*, defaults to 1024):
- Step size between each window of the waveform, in samples.
- min_frequency (`float`, *optional*, defaults to 10.0):
- Lowest frequency that will be used in the log-mel spectrogram.
- feature_size (`int`, *optional*, defaults to 512):
- The feature dimension of the extracted features.
- num_bars (`int`, *optional*, defaults to 2):
- Determines interval between each sequence.
- """
-
- model_input_names = ["input_features", "beatsteps", "extrapolated_beatstep"]
-
- def __init__(
- self,
- sampling_rate: int = 22050,
- padding_value: int = 0,
- window_size: int = 4096,
- hop_length: int = 1024,
- min_frequency: float = 10.0,
- feature_size: int = 512,
- num_bars: int = 2,
- **kwargs,
- ):
- super().__init__(
- feature_size=feature_size,
- sampling_rate=sampling_rate,
- padding_value=padding_value,
- **kwargs,
- )
- self.sampling_rate = sampling_rate
- self.padding_value = padding_value
- self.window_size = window_size
- self.hop_length = hop_length
- self.min_frequency = min_frequency
- self.feature_size = feature_size
- self.num_bars = num_bars
- self.mel_filters = mel_filter_bank(
- num_frequency_bins=(self.window_size // 2) + 1,
- num_mel_filters=self.feature_size,
- min_frequency=self.min_frequency,
- max_frequency=float(self.sampling_rate // 2),
- sampling_rate=self.sampling_rate,
- norm=None,
- mel_scale="htk",
- )
-
- def mel_spectrogram(self, sequence: np.ndarray):
- """
- Generates MelSpectrogram.
-
- Args:
- sequence (`numpy.ndarray`):
- The sequence of which the mel-spectrogram will be computed.
- """
- mel_specs = []
- for seq in sequence:
- window = np.hanning(self.window_size + 1)[:-1]
- mel_specs.append(
- spectrogram(
- waveform=seq,
- window=window,
- frame_length=self.window_size,
- hop_length=self.hop_length,
- power=2.0,
- mel_filters=self.mel_filters,
- )
- )
- mel_specs = np.array(mel_specs)
-
- return mel_specs
-
- def extract_rhythm(self, audio: np.ndarray):
- """
-        This algorithm (`RhythmExtractor2013`) extracts the beat positions and estimates their confidence as well as
-        the tempo in bpm for an audio signal. For more information, please visit
- https://essentia.upf.edu/reference/std_RhythmExtractor2013.html .
-
- Args:
- audio(`numpy.ndarray`):
- raw audio waveform which is passed to the Rhythm Extractor.
- """
- requires_backends(self, ["essentia"])
- essentia_tracker = essentia.standard.RhythmExtractor2013(method="multifeature")
- bpm, beat_times, confidence, estimates, essentia_beat_intervals = essentia_tracker(audio)
-
- return bpm, beat_times, confidence, estimates, essentia_beat_intervals
-
- def interpolate_beat_times(
- self, beat_times: numpy.ndarray, steps_per_beat: numpy.ndarray, n_extend: numpy.ndarray
- ):
- """
- This method takes beat_times and then interpolates that using `scipy.interpolate.interp1d` and the output is
- then used to convert raw audio to log-mel-spectrogram.
-
- Args:
- beat_times (`numpy.ndarray`):
- beat_times is passed into `scipy.interpolate.interp1d` for processing.
- steps_per_beat (`int`):
-                used as a parameter to control the interpolation.
-            n_extend (`int`):
-                used as a parameter to control the interpolation.
- """
-
- requires_backends(self, ["scipy"])
- beat_times_function = scipy.interpolate.interp1d(
- np.arange(beat_times.size),
- beat_times,
- bounds_error=False,
- fill_value="extrapolate",
- )
-
- ext_beats = beat_times_function(
- np.linspace(0, beat_times.size + n_extend - 1, beat_times.size * steps_per_beat + n_extend)
- )
-
- return ext_beats
-
- def preprocess_mel(self, audio: np.ndarray, beatstep: np.ndarray):
- """
- Preprocessing for log-mel-spectrogram
-
- Args:
- audio (`numpy.ndarray` of shape `(audio_length, )` ):
- Raw audio waveform to be processed.
- beatstep (`numpy.ndarray`):
- Interpolated values of the raw audio. If beatstep[0] is greater than 0.0, then it will be shifted by
- the value at beatstep[0].
- """
-
- if audio is not None and len(audio.shape) != 1:
- raise ValueError(
- f"Expected `audio` to be a single channel audio input of shape `(n, )` but found shape {audio.shape}."
- )
- if beatstep[0] > 0.0:
- beatstep = beatstep - beatstep[0]
-
- num_steps = self.num_bars * 4
- num_target_steps = len(beatstep)
- extrapolated_beatstep = self.interpolate_beat_times(
- beat_times=beatstep, steps_per_beat=1, n_extend=(self.num_bars + 1) * 4 + 1
- )
-
- sample_indices = []
- max_feature_length = 0
- for i in range(0, num_target_steps, num_steps):
- start_idx = i
- end_idx = min(i + num_steps, num_target_steps)
- start_sample = int(extrapolated_beatstep[start_idx] * self.sampling_rate)
- end_sample = int(extrapolated_beatstep[end_idx] * self.sampling_rate)
- sample_indices.append((start_sample, end_sample))
- max_feature_length = max(max_feature_length, end_sample - start_sample)
- padded_batch = []
- for start_sample, end_sample in sample_indices:
- feature = audio[start_sample:end_sample]
- padded_feature = np.pad(
- feature,
- ((0, max_feature_length - feature.shape[0]),),
- "constant",
- constant_values=0,
- )
- padded_batch.append(padded_feature)
-
- padded_batch = np.asarray(padded_batch)
- return padded_batch, extrapolated_beatstep
-
- def _pad(self, features: np.ndarray, add_zero_line=True):
- features_shapes = [each_feature.shape for each_feature in features]
- attention_masks, padded_features = [], []
- for i, each_feature in enumerate(features):
- # To pad "input_features".
- if len(each_feature.shape) == 3:
- features_pad_value = max([*zip(*features_shapes)][1]) - features_shapes[i][1]
- attention_mask = np.ones(features_shapes[i][:2], dtype=np.int64)
- feature_padding = ((0, 0), (0, features_pad_value), (0, 0))
- attention_mask_padding = (feature_padding[0], feature_padding[1])
-
- # To pad "beatsteps" and "extrapolated_beatstep".
- else:
- each_feature = each_feature.reshape(1, -1)
- features_pad_value = max([*zip(*features_shapes)][0]) - features_shapes[i][0]
- attention_mask = np.ones(features_shapes[i], dtype=np.int64).reshape(1, -1)
- feature_padding = attention_mask_padding = ((0, 0), (0, features_pad_value))
-
- each_padded_feature = np.pad(each_feature, feature_padding, "constant", constant_values=self.padding_value)
- attention_mask = np.pad(
- attention_mask, attention_mask_padding, "constant", constant_values=self.padding_value
- )
-
- if add_zero_line:
- # if it is batched then we separate each examples using zero array
- zero_array_len = max([*zip(*features_shapes)][1])
-
- # we concatenate the zero array line here
- each_padded_feature = np.concatenate(
- [each_padded_feature, np.zeros([1, zero_array_len, self.feature_size])], axis=0
- )
- attention_mask = np.concatenate(
- [attention_mask, np.zeros([1, zero_array_len], dtype=attention_mask.dtype)], axis=0
- )
-
- padded_features.append(each_padded_feature)
- attention_masks.append(attention_mask)
-
- padded_features = np.concatenate(padded_features, axis=0).astype(np.float32)
- attention_masks = np.concatenate(attention_masks, axis=0).astype(np.int64)
-
- return padded_features, attention_masks
-
- def pad(
- self,
- inputs: BatchFeature,
- is_batched: bool,
- return_attention_mask: bool,
- return_tensors: str | TensorType | None = None,
- ):
- """
- Pads the inputs to same length and returns attention_mask.
-
- Args:
- inputs (`BatchFeature`):
- Processed audio features.
- is_batched (`bool`):
- Whether inputs are batched or not.
- return_attention_mask (`bool`):
- Whether to return attention mask or not.
- return_tensors (`str` or [`~utils.TensorType`], *optional*):
- If set, will return tensors instead of list of python integers. Acceptable values are:
- - `'pt'`: Return PyTorch `torch.Tensor` objects.
- - `'np'`: Return Numpy `np.ndarray` objects.
- If nothing is specified, it will return list of `np.ndarray` arrays.
- Return:
- `BatchFeature` with attention_mask, attention_mask_beatsteps and attention_mask_extrapolated_beatstep added
- to it:
- - **attention_mask** numpy.ndarray of shape `(batch_size, max_input_features_seq_length)` --
- Example :
- 1, 1, 1, 0, 0 (audio 1, also here it is padded to max length of 5 that's why there are 2 zeros at
- the end indicating they are padded)
-
- 0, 0, 0, 0, 0 (zero pad to separate audio 1 and 2)
-
- 1, 1, 1, 1, 1 (audio 2)
-
- 0, 0, 0, 0, 0 (zero pad to separate audio 2 and 3)
-
- 1, 1, 1, 1, 1 (audio 3)
- - **attention_mask_beatsteps** numpy.ndarray of shape `(batch_size, max_beatsteps_seq_length)`
- - **attention_mask_extrapolated_beatstep** numpy.ndarray of shape `(batch_size,
- max_extrapolated_beatstep_seq_length)`
- """
-
- processed_features_dict = {}
- for feature_name, feature_value in inputs.items():
- if feature_name == "input_features":
- padded_feature_values, attention_mask = self._pad(feature_value, add_zero_line=True)
- processed_features_dict[feature_name] = padded_feature_values
- if return_attention_mask:
- processed_features_dict["attention_mask"] = attention_mask
- else:
- padded_feature_values, attention_mask = self._pad(feature_value, add_zero_line=False)
- processed_features_dict[feature_name] = padded_feature_values
- if return_attention_mask:
- processed_features_dict[f"attention_mask_{feature_name}"] = attention_mask
-
- # If we are processing only one example, we should remove the zero array line since we don't need it to
- # separate examples from each other.
- if not is_batched and not return_attention_mask:
- processed_features_dict["input_features"] = processed_features_dict["input_features"][:-1, ...]
-
- outputs = BatchFeature(processed_features_dict, tensor_type=return_tensors)
-
- return outputs
-
- def __call__(
- self,
- audio: np.ndarray | list[float] | list[np.ndarray] | list[list[float]],
- sampling_rate: int | list[int],
- steps_per_beat: int = 2,
- resample: bool | None = True,
- return_attention_mask: bool | None = False,
- return_tensors: str | TensorType | None = None,
- **kwargs,
- ) -> BatchFeature:
- """
- Main method to featurize and prepare for the model.
-
- Args:
- audio (`np.ndarray`, `List`):
- The audio or batch of audio to be processed. Each audio can be a numpy array, a list of float values, a
- list of numpy arrays or a list of list of float values.
- sampling_rate (`int`):
- The sampling rate at which the `audio` input was sampled. It is strongly recommended to pass
- `sampling_rate` at the forward call to prevent silent errors.
- steps_per_beat (`int`, *optional*, defaults to 2):
- This is used in interpolating `beat_times`.
- resample (`bool`, *optional*, defaults to `True`):
- Determines whether to resample the audio to `sampling_rate` or not before processing. Must be True
- during inference.
- return_attention_mask (`bool` *optional*, defaults to `False`):
- Denotes if attention_mask for input_features, beatsteps and extrapolated_beatstep will be given as
- output or not. Automatically set to True for batched inputs.
- return_tensors (`str` or [`~utils.TensorType`], *optional*):
- If set, will return tensors instead of list of python integers. Acceptable values are:
- - `'pt'`: Return PyTorch `torch.Tensor` objects.
- - `'np'`: Return Numpy `np.ndarray` objects.
- If nothing is specified, it will return list of `np.ndarray` arrays.
- """
-
- requires_backends(self, ["librosa"])
- is_batched = isinstance(audio, (list, tuple)) and isinstance(audio[0], (np.ndarray, tuple, list))
- if is_batched:
-            # This enables the user to process files of different sampling rates at the same time
- if not isinstance(sampling_rate, list):
- raise ValueError(
- "Please give sampling_rate of each audio separately when you are passing multiple raw_audios at the same time. "
- f"Received {sampling_rate}, expected [audio_1_sr, ..., audio_n_sr]."
- )
- return_attention_mask = True if return_attention_mask is None else return_attention_mask
- else:
- audio = [audio]
- sampling_rate = [sampling_rate]
- return_attention_mask = False if return_attention_mask is None else return_attention_mask
-
- batch_input_features, batch_beatsteps, batch_ext_beatstep = [], [], []
- for single_raw_audio, single_sampling_rate in zip(audio, sampling_rate):
- bpm, beat_times, confidence, estimates, essentia_beat_intervals = self.extract_rhythm(
- audio=single_raw_audio
- )
- beatsteps = self.interpolate_beat_times(beat_times=beat_times, steps_per_beat=steps_per_beat, n_extend=1)
-
- if self.sampling_rate != single_sampling_rate and self.sampling_rate is not None:
- if resample:
- # Change sampling_rate to self.sampling_rate
- single_raw_audio = librosa.core.resample(
- single_raw_audio,
- orig_sr=single_sampling_rate,
- target_sr=self.sampling_rate,
- res_type="kaiser_best",
- )
- else:
- warnings.warn(
- f"The sampling_rate of the provided audio is different from the target sampling_rate "
- f"of the Feature Extractor, {self.sampling_rate} vs {single_sampling_rate}. "
- f"In these cases it is recommended to use `resample=True` in the `__call__` method to "
- f"get the optimal behaviour."
- )
-
- single_sampling_rate = self.sampling_rate
- start_sample = int(beatsteps[0] * single_sampling_rate)
- end_sample = int(beatsteps[-1] * single_sampling_rate)
-
- input_features, extrapolated_beatstep = self.preprocess_mel(
- single_raw_audio[start_sample:end_sample], beatsteps - beatsteps[0]
- )
-
- mel_specs = self.mel_spectrogram(input_features.astype(np.float32))
-
- # apply np.log to get log mel-spectrograms
- log_mel_specs = np.log(np.clip(mel_specs, a_min=1e-6, a_max=None))
-
- input_features = np.transpose(log_mel_specs, (0, -1, -2))
-
- batch_input_features.append(input_features)
- batch_beatsteps.append(beatsteps)
- batch_ext_beatstep.append(extrapolated_beatstep)
-
- output = BatchFeature(
- {
- "input_features": batch_input_features,
- "beatsteps": batch_beatsteps,
- "extrapolated_beatstep": batch_ext_beatstep,
- }
- )
-
- output = self.pad(
- output,
- is_batched=is_batched,
- return_attention_mask=return_attention_mask,
- return_tensors=return_tensors,
- )
-
- return output
+Pop2PianoFeatureExtractor = deprecated_feature_extractor(Pop2PianoAudioProcessor, "Pop2PianoFeatureExtractor")
__all__ = ["Pop2PianoFeatureExtractor"]
diff --git a/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py b/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py
index dca38d2a1d01..49490e8b9bce 100644
--- a/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py
+++ b/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py
@@ -19,7 +19,7 @@
"""Image processor class for Qwen2-VL."""
import math
-from typing import Iterable
+from collections.abc import Iterable
import torch
diff --git a/src/transformers/models/seamless_m4t/audio_processing_seamless_m4t.py b/src/transformers/models/seamless_m4t/audio_processing_seamless_m4t.py
new file mode 100644
index 000000000000..de597178b446
--- /dev/null
+++ b/src/transformers/models/seamless_m4t/audio_processing_seamless_m4t.py
@@ -0,0 +1,94 @@
+# Copyright 2025 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+
+from ...audio_processing_backends import NumpyAudioBackend
+from ...audio_utils import MelScaleConfig, SpectrogramConfig, StftConfig
+
+
+class SeamlessM4tAudioProcessor(NumpyAudioBackend):
+ sample_rate = 16000
+ force_mono = True
+ do_batch_spectrogram = False
+ stride = 2
+ pad_to_multiple_of = 2 # Align feature padding to stride
+
+ spectrogram_config = SpectrogramConfig(
+ stft_config=StftConfig(
+ n_fft=512,
+ win_length=400,
+ hop_length=160,
+ window_fn="povey",
+ power=2.0,
+ center=False,
+ periodic=False,
+ ),
+ mel_scale_config=MelScaleConfig(
+ n_mels=80,
+ f_min=20.0,
+ f_max=8000.0,
+ mel_scale="kaldi",
+ triangularize_in_mel_space=True,
+ ),
+ log_mode="log",
+ preemphasis=0.97,
+ remove_dc_offset=True,
+ mel_floor=1.192092955078125e-07,
+ computation_dtype="float64",
+ )
+ waveform_scale = 32768.0
+
+ def extract_spectrogram(self, audio, **kwargs):
+ # Per-waveform fbank extraction returning (time, n_mels)
+ features = []
+ for waveform in audio:
+ waveform = np.squeeze(waveform) * self.waveform_scale
+ f = super().extract_spectrogram([waveform], spectrogram_config=self.spectrogram_config)
+ features.append(f[0].T)
+ return features
+
+ def _postprocess_features(self, features, feature_lengths):
+ # Per-utterance mean/variance normalization (before padding)
+ normalized = []
+ for f in features:
+ mean = np.expand_dims(f.mean(axis=0), 0)
+ var = np.expand_dims(f.var(axis=0, ddof=1), 0)
+ normalized.append((f - mean) / np.sqrt(var + 1e-7))
+ return normalized
+
+ def _postprocess_output(self, output, feature_ranges=None, **kwargs):
+ features = output["audio_features"] # (batch, num_frames, num_channels)
+ batch_size, num_frames, num_channels = features.shape
+
+ # Stride concatenation
+ remainder = num_frames % self.stride
+ if remainder != 0:
+ features = features[:, :num_frames - remainder, :]
+ num_frames = num_frames - remainder
+
+ output["audio_features"] = features.reshape(batch_size, num_frames // self.stride, num_channels * self.stride)
+
+ # Adjust mask for stride
+ if "audio_features_mask" in output:
+ mask = output["audio_features_mask"]
+ if remainder != 0:
+ mask = mask[:, :num_frames]
+ indices = np.arange(0, num_frames)
+ output["audio_features_mask"] = mask[:, indices % self.stride == 1]
+
+ return output
+
+
+__all__ = ["SeamlessM4tAudioProcessor"]
diff --git a/src/transformers/models/seamless_m4t/feature_extraction_seamless_m4t.py b/src/transformers/models/seamless_m4t/feature_extraction_seamless_m4t.py
index 1b18dcc33404..174bc72baa16 100644
--- a/src/transformers/models/seamless_m4t/feature_extraction_seamless_m4t.py
+++ b/src/transformers/models/seamless_m4t/feature_extraction_seamless_m4t.py
@@ -11,295 +11,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-"""
-Feature extractor class for SeamlessM4T
-"""
+from ...utils.deprecation import deprecated_feature_extractor
+from .audio_processing_seamless_m4t import SeamlessM4tAudioProcessor
-import numpy as np
-
-from ...utils import is_torch_available
-
-
-if is_torch_available():
- import torch
-
-from ...audio_utils import mel_filter_bank, spectrogram, window_function
-from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
-from ...feature_extraction_utils import BatchFeature
-from ...utils import PaddingStrategy, TensorType, logging
-
-
-logger = logging.get_logger(__name__)
-
-
-class SeamlessM4TFeatureExtractor(SequenceFeatureExtractor):
- r"""
- Constructs a SeamlessM4T feature extractor.
-
- This feature extractor inherits from [`SequenceFeatureExtractor`] which contains most of the main methods. Users
- should refer to this superclass for more information regarding those methods.
-
- This class extracts mel-filter bank features from raw speech.
-
- Args:
- feature_size (`int`, *optional*, defaults to 80):
- The feature dimension of the extracted features.
- sampling_rate (`int`, *optional*, defaults to 16000):
- The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
- num_mel_bins (`int`, *optional*, defaults to 80):
- Number of Mel-frequency bins.
- padding_value (`float`, *optional*, defaults to 0.0):
- The value that is used to fill the padding vectors.
- stride (`int`, *optional*, defaults to 2):
- Stride used to reshape audios from shape (batch_size,num_frames,num_mel_bins) to
- (batch_size,num_frames//stride,num_mel_bins*stride).
- """
-
- model_input_names = ["input_features", "attention_mask"]
-
- def __init__(
- self,
- feature_size=80,
- sampling_rate=16000,
- num_mel_bins=80,
- padding_value=0.0,
- stride=2,
- **kwargs,
- ):
- self.num_mel_bins = num_mel_bins
- self.return_attention_mask = True
- self.stride = stride
-
- mel_filters = mel_filter_bank(
- num_frequency_bins=257,
- num_mel_filters=self.num_mel_bins,
- min_frequency=20,
- max_frequency=sampling_rate // 2,
- sampling_rate=sampling_rate,
- norm=None,
- mel_scale="kaldi",
- triangularize_in_mel_space=True,
- )
-
- self.mel_filters = mel_filters
- self.window = window_function(400, "povey", periodic=False)
-
- super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs)
-
- @staticmethod
- # Copied from transformers.models.wav2vec2.feature_extraction_wav2vec2.Wav2Vec2FeatureExtractor.zero_mean_unit_var_norm
- def zero_mean_unit_var_norm(
- input_values: list[np.ndarray], attention_mask: list[np.ndarray], padding_value: float = 0.0
- ) -> list[np.ndarray]:
- """
- Every array in the list is normalized to have zero mean and unit variance
- """
- if attention_mask is not None:
- attention_mask = np.array(attention_mask, np.int32)
- normed_input_values = []
-
- for vector, length in zip(input_values, attention_mask.sum(-1)):
- normed_slice = (vector - vector[:length].mean()) / np.sqrt(vector[:length].var() + 1e-7)
- if length < normed_slice.shape[0]:
- normed_slice[length:] = padding_value
-
- normed_input_values.append(normed_slice)
- else:
- normed_input_values = [(x - x.mean()) / np.sqrt(x.var() + 1e-7) for x in input_values]
-
- return normed_input_values
-
- def _extract_fbank_features(
- self,
- waveform: np.ndarray,
- ) -> np.ndarray:
- """
- Get mel-filter bank features using TorchAudio. Note that TorchAudio requires 16-bit signed integers as inputs
- and hence the waveform should not be normalized before feature extraction.
- """
- # by default, it extracts the left channel if stereo
- if len(waveform.shape) == 2:
- waveform = waveform[0]
-
- waveform = np.squeeze(waveform) * (2**15) # Kaldi compliance: 16-bit signed integers
- features = spectrogram(
- waveform,
- self.window,
- frame_length=400,
- hop_length=160,
- fft_length=512,
- power=2.0,
- center=False,
- preemphasis=0.97,
- mel_filters=self.mel_filters,
- log_mel="log",
- mel_floor=1.192092955078125e-07,
- remove_dc_offset=True,
- ).T
- return features
-
- def __call__(
- self,
- raw_speech: np.ndarray | list[float] | list[np.ndarray] | list[list[float]],
- padding: bool | str | PaddingStrategy = True,
- pad_to_multiple_of: int | None = 2,
- max_length: int | None = None,
- truncation: bool = False,
- return_tensors: str | TensorType | None = None,
- sampling_rate: int | None = None,
- return_attention_mask: bool | None = None,
- do_normalize_per_mel_bins: bool | None = True,
- **kwargs,
- ) -> BatchFeature:
- """
- Main method to featurize and prepare for the model one or several sequence(s).
-
- Args:
- raw_speech (`np.ndarray`, `torch.Tensor`, `list[float]`, `list[np.ndarray]`, `list[torch.Tensor]`,
- `list[list[float]]`, `list[list[list[float]]]`):
- The sequence or batch of sequences to be padded. Each sequence can be a numpy array,
- a torch tensor, a list of float values, a list of numpy arrays, a list of torch tensors,
- a list of list of float values or a list of a list of list of float values.
- If `raw_speech` is a one-dimensional `np.ndarray`, `torch.Tensor` or a `list[float]`, `raw_speech` is
- considered a single-channel, single-sample sound. In all other cases, the first dimension of
- `raw_speech`, whether from an `np.ndarray`, a `torch.Tensor` or a `list[...]`,
- corresponds to the number of samples in the batch, and the number of channels
- (i.e. mono or stereo character) is derived from the other dimensions
- (1D -> single-channel waveform batches; 2D-> stereo-channel waveform batches).
- padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
- Select a strategy to pad the returned sequences (according to the model's padding side and padding
- index) among:
-
- - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
-                  sequence is provided).
- - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
- acceptable input length for the model if that argument is not provided.
- - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
- lengths).
- pad_to_multiple_of (`int`, *optional*, defaults to 2):
- If set will pad the sequence to a multiple of the provided value.
-
- This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
- `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
- max_length (`int`, *optional*):
- Maximum length of the returned list and optionally padding length (see above).
- truncation (`bool`):
- Activates truncation to cut input sequences longer than *max_length* to *max_length*.
- return_attention_mask (`bool`, *optional*):
- Whether to return the attention mask. If left to the default, will return the attention mask according
- to the specific feature_extractor's default.
-
- [What are attention masks?](../glossary#attention-mask)
-
-
-
- For SeamlessM4T models, `attention_mask` should always be passed for batched inference, to avoid subtle
- bugs.
-
-
-
- return_tensors (`str` or [`~utils.TensorType`], *optional*):
- If set, will return tensors instead of list of python integers. Acceptable values are:
-
- - `'pt'`: Return PyTorch `torch.Tensor` objects.
- - `'np'`: Return Numpy `np.ndarray` objects.
- sampling_rate (`int`, *optional*):
- The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass
- `sampling_rate` at the forward call to prevent silent errors.
- do_normalize_per_mel_bins (`bool`, *optional*, defaults to `True`):
- Whether or not to zero-mean unit-variance normalize the input per mel-channel.
- kwargs (*optional*):
- Remaining dictionary of keyword arguments that will be passed to the tokenizer or the feature
- extractor.
- """
- if sampling_rate is not None:
- if sampling_rate != self.sampling_rate:
- raise ValueError(
- f"The model corresponding to this feature extractor: {self} was trained using a sampling rate of"
- f" {self.sampling_rate}. Please make sure that the provided `raw_speech` input was sampled with"
- f" {self.sampling_rate} and not {sampling_rate}."
- )
- else:
- logger.warning(
- f"It is strongly recommended to pass the `sampling_rate` argument to `{self.__class__.__name__}()`. "
- "Failing to do so can result in silent errors that might be hard to debug."
- )
-
- return_attention_mask = (
- return_attention_mask if return_attention_mask is not None else self.return_attention_mask
- )
-
- is_batched_numpy = isinstance(raw_speech, np.ndarray) and len(raw_speech.shape) > 1
- if is_batched_numpy and len(raw_speech.shape) > 3:
- raise ValueError(f"Only mono-channel or stereo-channel audio is supported for input to {self}")
-
- acceptable_types = (
- (torch.Tensor, np.ndarray, tuple, list) if is_torch_available() else (np.ndarray, tuple, list)
- )
- is_batched = is_batched_numpy or (
- isinstance(raw_speech, (list, tuple)) and (isinstance(raw_speech[0], acceptable_types))
- )
-
- if is_batched:
- raw_speech = [np.asarray(speech, dtype=np.float32) for speech in raw_speech]
- elif not is_batched and not isinstance(raw_speech, np.ndarray):
- raw_speech = np.asarray(raw_speech, dtype=np.float32)
- elif isinstance(raw_speech, np.ndarray) and raw_speech.dtype is np.dtype(np.float64):
- raw_speech = raw_speech.astype(np.float32)
-
- # always return batch
- if not is_batched:
- raw_speech = [raw_speech]
-
- # extract fbank features
- features = [self._extract_fbank_features(waveform) for waveform in raw_speech]
-
- if do_normalize_per_mel_bins:
- # torch defaults to ddof=1, and numpy defaults to ddof=0
- features = [
- (x - np.expand_dims(x.mean(0), 0)) / np.sqrt(np.expand_dims(x.var(0, ddof=1), 0) + 1e-7)
- for x in features
- ]
-
- # convert into correct format for padding
- encoded_inputs = BatchFeature({"input_features": features})
-
- padded_inputs = self.pad(
- encoded_inputs,
- padding=padding,
- max_length=max_length,
- truncation=truncation,
- pad_to_multiple_of=pad_to_multiple_of,
- return_attention_mask=True,
- return_tensors="np",
- )
-
- # SeamlessM4T needs to process extracted features
- input_features = padded_inputs.get("input_features")
- attention_mask = padded_inputs.pop("attention_mask")
-
- batch_size, num_frames, num_channels = input_features.shape
-
- remainder = num_frames % self.stride
- if remainder != 0:
- input_features = input_features[:, : num_frames - remainder, :]
- attention_mask = attention_mask[:, : num_frames - remainder]
-
- input_features = np.reshape(
- input_features, (batch_size, num_frames // self.stride, num_channels * self.stride)
- )
-
- indices = np.arange(0, num_frames - remainder)
- attention_mask = attention_mask[:, indices % self.stride == 1]
-
- padded_inputs["input_features"] = input_features
- if return_attention_mask:
- padded_inputs["attention_mask"] = attention_mask
-
- if return_tensors is not None:
- padded_inputs = padded_inputs.convert_to_tensors(return_tensors)
-
- return padded_inputs
+SeamlessM4TFeatureExtractor = deprecated_feature_extractor(SeamlessM4tAudioProcessor, "SeamlessM4TFeatureExtractor")
__all__ = ["SeamlessM4TFeatureExtractor"]
diff --git a/src/transformers/models/speech_to_text/audio_processing_speech_to_text.py b/src/transformers/models/speech_to_text/audio_processing_speech_to_text.py
new file mode 100644
index 000000000000..5f9717738982
--- /dev/null
+++ b/src/transformers/models/speech_to_text/audio_processing_speech_to_text.py
@@ -0,0 +1,88 @@
+# Copyright 2025 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+
+from ...audio_processing_backends import NumpyAudioBackend
+from ...audio_utils import MelScaleConfig, SpectrogramConfig, StftConfig
+
+
+class SpeechToTextAudioProcessor(NumpyAudioBackend):
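+    r"""
+    Audio processor for Speech2Text. Extracts Kaldi-style log-mel filter bank features from raw mono speech
+    sampled at 16 kHz and applies utterance-level cepstral mean and variance normalization (CMVN).
+    """
+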
+ sample_rate = 16000
+ force_mono = True
+ do_batch_spectrogram = False
+
+ spectrogram_config = SpectrogramConfig(
+ stft_config=StftConfig(
+ n_fft=512,
+ win_length=400,
+ hop_length=160,
+ window_fn="povey",
+ power=2.0,
+ center=False,
+ periodic=False,
+ ),
+ mel_scale_config=MelScaleConfig(
+ n_mels=80,
+ f_min=20.0,
+ f_max=8000.0,
+ mel_scale="kaldi",
+ triangularize_in_mel_space=True,
+ ),
+ log_mode="log",
+ preemphasis=0.97,
+ remove_dc_offset=True,
+ mel_floor=1.192092955078125e-07,
+ )
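+    # Kaldi compliance: waveforms are scaled to the 16-bit signed integer range before fbank extraction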
+ waveform_scale = 32768.0
+
+ def __init__(self, normalize_means=True, normalize_vars=True, **kwargs):
+ super().__init__(**kwargs)
+ self.normalize_means = normalize_means
+ self.normalize_vars = normalize_vars
+
+ def _extract_fbank_features(self, waveform):
+ """Extract log-mel filterbank features for a single waveform."""
+ waveform = waveform * self.waveform_scale
+ return self._kaldi_fbank(waveform, num_mel_bins=80)
+
+ def extract_spectrogram(self, audio, **kwargs):
+ # Per-waveform fbank extraction returning (time, n_mels)
+ return [self._extract_fbank_features(waveform) for waveform in audio]
+
+ @staticmethod
+ def utterance_cmvn(x, input_length, normalize_means=True, normalize_vars=True, padding_value=0.0):
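+        """Utterance-level cepstral mean and variance normalization computed over the unpadded frames only."""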
+ if normalize_means:
+ mean = x[:input_length].mean(axis=0)
+ x = np.subtract(x, mean)
+ if normalize_vars:
+ std = x[:input_length].std(axis=0)
+ x = np.divide(x, std)
+ if input_length < x.shape[0]:
+ x[input_length:] = padding_value
+ return x.astype(np.float32)
+
+ def _postprocess_output(self, output, feature_ranges=None, **kwargs):
+ # Apply utterance CMVN normalization on the padded, stacked features
+ features = output["audio_features"] # (batch, time, n_mels)
+ normalized = []
+ for i, (start, end) in enumerate(feature_ranges):
+ length = end - start
+ normalized.append(
+ self.utterance_cmvn(features[i], length, self.normalize_means, self.normalize_vars, self.padding_value)
+ )
+ output["audio_features"] = np.stack(normalized)
+ return output
+
+
+__all__ = ["SpeechToTextAudioProcessor"]
diff --git a/src/transformers/models/speech_to_text/feature_extraction_speech_to_text.py b/src/transformers/models/speech_to_text/feature_extraction_speech_to_text.py
index 9685e9be0134..584afc35f229 100644
--- a/src/transformers/models/speech_to_text/feature_extraction_speech_to_text.py
+++ b/src/transformers/models/speech_to_text/feature_extraction_speech_to_text.py
@@ -11,301 +11,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-"""
-Feature extractor class for Speech2Text
-"""
+from ...utils.deprecation import deprecated_feature_extractor
+from .audio_processing_speech_to_text import SpeechToTextAudioProcessor
-import numpy as np
-
-from ...audio_utils import mel_filter_bank, spectrogram, window_function
-from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
-from ...feature_extraction_utils import BatchFeature
-from ...utils import PaddingStrategy, TensorType, is_speech_available, logging
-
-
-if is_speech_available():
- import torch
- import torchaudio.compliance.kaldi as ta_kaldi
-
-logger = logging.get_logger(__name__)
-
-
-class Speech2TextFeatureExtractor(SequenceFeatureExtractor):
- r"""
- Constructs a Speech2Text feature extractor.
-
- This feature extractor inherits from [`Speech2TextFeatureExtractor`] which contains most of the main methods. Users
- should refer to this superclass for more information regarding those methods.
-
- This class extracts mel-filter bank features from raw speech using TorchAudio if installed or using numpy
- otherwise, and applies utterance-level cepstral mean and variance normalization to the extracted features.
-
- Args:
- feature_size (`int`, *optional*, defaults to 80):
- The feature dimension of the extracted features.
- sampling_rate (`int`, *optional*, defaults to 16000):
- The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
- num_mel_bins (`int`, *optional*, defaults to 80):
- Number of Mel-frequency bins.
- padding_value (`float`, *optional*, defaults to 0.0):
- The value that is used to fill the padding vectors.
- dither (`float`, *optional*, defaults to 0.0):
- Adds dithering. In other words, adds a small Gaussian noise to each frame.
- E.g. use 4.0 to add dithering with a normal distribution centered
- around 0.0 with standard deviation 4.0 (assuming [-32k,+32k] range of kaldi waveform).
- The value 0.0 means no dithering.
- Dithering has similar effect as `mel_floor`. It reduces the high log_mel_fbank
- values for signals with hard-zero sections, when VAD cutoff is present in the signal.
- do_ceptral_normalize (`bool`, *optional*, defaults to `True`):
- Whether or not to apply utterance-level cepstral mean and variance normalization to extracted features.
- normalize_means (`bool`, *optional*, defaults to `True`):
- Whether or not to zero-mean normalize the extracted features.
- normalize_vars (`bool`, *optional*, defaults to `True`):
- Whether or not to unit-variance normalize the extracted features.
- """
-
- model_input_names = ["input_features", "attention_mask"]
-
- def __init__(
- self,
- feature_size=80,
- sampling_rate=16000,
- num_mel_bins=80,
- padding_value=0.0,
- dither=0.0,
- do_ceptral_normalize=True,
- normalize_means=True,
- normalize_vars=True,
- **kwargs,
- ):
- super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs)
- self.num_mel_bins = num_mel_bins
- self.dither = dither
- self.do_ceptral_normalize = do_ceptral_normalize
- self.normalize_means = normalize_means
- self.normalize_vars = normalize_vars
- self.return_attention_mask = True
-
- if not is_speech_available():
- mel_filters = mel_filter_bank(
- num_frequency_bins=257,
- num_mel_filters=self.num_mel_bins,
- min_frequency=20,
- max_frequency=sampling_rate // 2,
- sampling_rate=sampling_rate,
- norm=None,
- mel_scale="kaldi",
- triangularize_in_mel_space=True,
- )
-
- self.mel_filters = mel_filters
- self.window = window_function(400, "povey", periodic=False)
-
- def _extract_fbank_features(
- self,
- waveform: np.ndarray,
- ) -> np.ndarray:
- """
- Get mel-filter bank features using TorchAudio. Note that TorchAudio requires 16-bit signed integers as inputs
- and hence the waveform should not be normalized before feature extraction.
- """
- waveform = waveform * (2**15) # Kaldi compliance: 16-bit signed integers
- if is_speech_available():
- waveform = torch.from_numpy(waveform).unsqueeze(0)
- features = ta_kaldi.fbank(
- waveform,
- dither=self.dither,
- num_mel_bins=self.num_mel_bins,
- sample_frequency=self.sampling_rate,
- )
- features = features.numpy()
- else:
- waveform = np.squeeze(waveform)
- features = spectrogram(
- waveform,
- self.window,
- frame_length=400,
- hop_length=160,
- fft_length=512,
- power=2.0,
- center=False,
- dither=self.dither,
- preemphasis=0.97,
- mel_filters=self.mel_filters,
- log_mel="log",
- mel_floor=1.192092955078125e-07,
- remove_dc_offset=True,
- ).T
- return features
-
- @staticmethod
- def utterance_cmvn(
- x: np.ndarray,
- input_length: int,
- normalize_means: bool | None = True,
- normalize_vars: bool | None = True,
- padding_value: float = 0.0,
- ) -> np.ndarray:
- # make sure we normalize float32 arrays
- if normalize_means:
- mean = x[:input_length].mean(axis=0)
- x = np.subtract(x, mean)
- if normalize_vars:
- std = x[:input_length].std(axis=0)
- x = np.divide(x, std)
-
- if input_length < x.shape[0]:
- x[input_length:] = padding_value
-
- # make sure array is in float32
- x = x.astype(np.float32)
-
- return x
-
- def normalize(
- self, input_features: list[np.ndarray], attention_mask: np.ndarray | None = None
- ) -> list[np.ndarray]:
- lengths = attention_mask.sum(-1) if attention_mask is not None else [x.shape[0] for x in input_features]
- return [
- self.utterance_cmvn(x, n, self.normalize_means, self.normalize_vars, self.padding_value)
- for x, n in zip(input_features, lengths)
- ]
-
- def __call__(
- self,
- raw_speech: np.ndarray | list[float] | list[np.ndarray] | list[list[float]],
- padding: bool | str | PaddingStrategy = False,
- max_length: int | None = None,
- truncation: bool = False,
- pad_to_multiple_of: int | None = None,
- return_tensors: str | TensorType | None = None,
- sampling_rate: int | None = None,
- return_attention_mask: bool | None = None,
- **kwargs,
- ) -> BatchFeature:
- """
- Main method to featurize and prepare for the model one or several sequence(s).
-
- Args:
- raw_speech (`np.ndarray`, `list[float]`, `list[np.ndarray]`, `list[list[float]]`):
- The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
- values, a list of numpy arrays or a list of list of float values. Must be mono channel audio, not
- stereo, i.e. single float per timestep.
- padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
- Select a strategy to pad the returned sequences (according to the model's padding side and padding
- index) among:
-
- - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
- sequence if provided).
- - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
- acceptable input length for the model if that argument is not provided.
- - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
- lengths).
- max_length (`int`, *optional*):
- Maximum length of the returned list and optionally padding length (see above).
- truncation (`bool`):
- Activates truncation to cut input sequences longer than *max_length* to *max_length*.
- pad_to_multiple_of (`int`, *optional*):
- If set will pad the sequence to a multiple of the provided value.
-
- This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
- `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
- return_attention_mask (`bool`, *optional*):
- Whether to return the attention mask. If left to the default, will return the attention mask according
- to the specific feature_extractor's default.
-
- [What are attention masks?](../glossary#attention-mask)
-
-
-
- For Speech2TextTransformer models, `attention_mask` should always be passed for batched inference, to
- avoid subtle bugs.
-
-
-
- return_tensors (`str` or [`~utils.TensorType`], *optional*):
- If set, will return tensors instead of list of python integers. Acceptable values are:
-
- - `'pt'`: Return PyTorch `torch.Tensor` objects.
- - `'np'`: Return Numpy `np.ndarray` objects.
- sampling_rate (`int`, *optional*):
- The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass
- `sampling_rate` at the forward call to prevent silent errors.
- padding_value (`float`, *optional*, defaults to 0.0):
- The value that is used to fill the padding values / vectors.
- """
-
- if sampling_rate is not None:
- if sampling_rate != self.sampling_rate:
- raise ValueError(
- f"The model corresponding to this feature extractor: {self} was trained using a sampling rate of"
- f" {self.sampling_rate}. Please make sure that the provided `raw_speech` input was sampled with"
- f" {self.sampling_rate} and not {sampling_rate}."
- )
- else:
- logger.warning(
- f"It is strongly recommended to pass the `sampling_rate` argument to `{self.__class__.__name__}()`. "
- "Failing to do so can result in silent errors that might be hard to debug."
- )
-
- is_batched_numpy = isinstance(raw_speech, np.ndarray) and len(raw_speech.shape) > 1
- if is_batched_numpy and len(raw_speech.shape) > 2:
- raise ValueError(f"Only mono-channel audio is supported for input to {self}")
- is_batched = is_batched_numpy or (
- isinstance(raw_speech, (list, tuple)) and (isinstance(raw_speech[0], (np.ndarray, tuple, list)))
- )
-
- if is_batched:
- raw_speech = [np.asarray(speech, dtype=np.float32) for speech in raw_speech]
- elif not is_batched and not isinstance(raw_speech, np.ndarray):
- raw_speech = np.asarray(raw_speech, dtype=np.float32)
- elif isinstance(raw_speech, np.ndarray) and raw_speech.dtype is np.dtype(np.float64):
- raw_speech = raw_speech.astype(np.float32)
-
- # always return batch
- if not is_batched:
- raw_speech = [raw_speech]
-
- # extract fbank features
- features = [self._extract_fbank_features(waveform) for waveform in raw_speech]
-
- # convert into correct format for padding
- encoded_inputs = BatchFeature({"input_features": features})
-
- padded_inputs = self.pad(
- encoded_inputs,
- padding=padding,
- max_length=max_length,
- truncation=truncation,
- pad_to_multiple_of=pad_to_multiple_of,
- return_attention_mask=return_attention_mask,
- **kwargs,
- )
-
- # make sure list is in array format
- input_features = padded_inputs.get("input_features")
- if isinstance(input_features[0], list):
- padded_inputs["input_features"] = [np.asarray(feature, dtype=np.float32) for feature in input_features]
-
- attention_mask = padded_inputs.get("attention_mask")
- if attention_mask is not None:
- padded_inputs["attention_mask"] = [np.asarray(array, dtype=np.int32) for array in attention_mask]
-
- # Utterance-level cepstral mean and variance normalization
- if self.do_ceptral_normalize:
- attention_mask = (
- np.array(attention_mask, dtype=np.int32)
- if self._get_padding_strategies(padding, max_length=max_length) is not PaddingStrategy.DO_NOT_PAD
- else None
- )
- padded_inputs["input_features"] = self.normalize(
- padded_inputs["input_features"], attention_mask=attention_mask
- )
-
- if return_tensors is not None:
- padded_inputs = padded_inputs.convert_to_tensors(return_tensors)
-
- return padded_inputs
+Speech2TextFeatureExtractor = deprecated_feature_extractor(SpeechToTextAudioProcessor, "Speech2TextFeatureExtractor")
__all__ = ["Speech2TextFeatureExtractor"]
diff --git a/src/transformers/models/speecht5/audio_processing_speecht5.py b/src/transformers/models/speecht5/audio_processing_speecht5.py
new file mode 100644
index 000000000000..4fc4c2226d35
--- /dev/null
+++ b/src/transformers/models/speecht5/audio_processing_speecht5.py
@@ -0,0 +1,23 @@
+# Copyright 2025 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ...audio_processing_backends import TorchAudioBackend
+
+
+class SpeechT5AudioProcessor(TorchAudioBackend):
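+    r"""
+    Audio processor for SpeechT5. Prepares raw mono speech waveforms sampled at 16 kHz for the model.
+    """
+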
+ sample_rate = 16000
+ force_mono = True
+
+
+__all__ = ["SpeechT5AudioProcessor"]
diff --git a/src/transformers/models/speecht5/feature_extraction_speecht5.py b/src/transformers/models/speecht5/feature_extraction_speecht5.py
index 5b9ca2e1f954..1aece171a6f3 100644
--- a/src/transformers/models/speecht5/feature_extraction_speecht5.py
+++ b/src/transformers/models/speecht5/feature_extraction_speecht5.py
@@ -11,364 +11,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-"""Feature extractor class for SpeechT5."""
+from ...utils.deprecation import deprecated_feature_extractor
+from .audio_processing_speecht5 import SpeechT5AudioProcessor
-from typing import Any
-
-import numpy as np
-
-from ...audio_utils import mel_filter_bank, optimal_fft_length, spectrogram, window_function
-from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
-from ...feature_extraction_utils import BatchFeature
-from ...utils import PaddingStrategy, TensorType, logging
-
-
-logger = logging.get_logger(__name__)
-
-
-class SpeechT5FeatureExtractor(SequenceFeatureExtractor):
- r"""
- Constructs a SpeechT5 feature extractor.
-
- This class can pre-process a raw speech signal by (optionally) normalizing to zero-mean unit-variance, for use by
- the SpeechT5 speech encoder prenet.
-
- This class can also extract log-mel filter bank features from raw speech, for use by the SpeechT5 speech decoder
- prenet.
-
- This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
- most of the main methods. Users should refer to this superclass for more information regarding those methods.
-
- Args:
- feature_size (`int`, *optional*, defaults to 1):
- The feature dimension of the extracted features.
- sampling_rate (`int`, *optional*, defaults to 16000):
- The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
- padding_value (`float`, *optional*, defaults to 0.0):
- The value that is used to fill the padding values.
- do_normalize (`bool`, *optional*, defaults to `False`):
- Whether or not to zero-mean unit-variance normalize the input. Normalizing can help to significantly
- improve the performance for some models.
- num_mel_bins (`int`, *optional*, defaults to 80):
- The number of mel-frequency bins in the extracted spectrogram features.
- hop_length (`int`, *optional*, defaults to 16):
- Number of ms between windows. Otherwise referred to as "shift" in many papers.
- win_length (`int`, *optional*, defaults to 64):
- Number of ms per window.
- win_function (`str`, *optional*, defaults to `"hann_window"`):
- Name for the window function used for windowing, must be accessible via `torch.{win_function}`
- fmin (`float`, *optional*, defaults to 80):
- Minimum mel frequency in Hz.
- fmax (`float`, *optional*, defaults to 7600):
- Maximum mel frequency in Hz.
- mel_floor (`float`, *optional*, defaults to 1e-10):
- Minimum value of mel frequency banks..
- return_attention_mask (`bool`, *optional*, defaults to `True`):
- Whether or not [`~SpeechT5FeatureExtractor.__call__`] should return `attention_mask`.
- """
-
- model_input_names = ["input_values", "attention_mask"]
-
- def __init__(
- self,
- feature_size: int = 1,
- sampling_rate: int = 16000,
- padding_value: float = 0.0,
- do_normalize: bool = False,
- num_mel_bins: int = 80,
- hop_length: int = 16,
- win_length: int = 64,
- win_function: str = "hann_window",
- fmin: float = 80,
- fmax: float = 7600,
- mel_floor: float = 1e-10,
- return_attention_mask: bool = True,
- **kwargs,
- ):
- super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs)
- self.do_normalize = do_normalize
- self.return_attention_mask = return_attention_mask
-
- self.num_mel_bins = num_mel_bins
- self.hop_length = hop_length
- self.win_length = win_length
- self.win_function = win_function
- self.fmin = fmin
- self.fmax = fmax
- self.mel_floor = mel_floor
-
- self.sample_size = win_length * sampling_rate // 1000
- self.sample_stride = hop_length * sampling_rate // 1000
- self.n_fft = optimal_fft_length(self.sample_size)
- self.n_freqs = (self.n_fft // 2) + 1
-
- self.window = window_function(window_length=self.sample_size, name=self.win_function, periodic=True)
-
- self.mel_filters = mel_filter_bank(
- num_frequency_bins=self.n_freqs,
- num_mel_filters=self.num_mel_bins,
- min_frequency=self.fmin,
- max_frequency=self.fmax,
- sampling_rate=self.sampling_rate,
- norm="slaney",
- mel_scale="slaney",
- )
-
- @staticmethod
- # Copied from transformers.models.wav2vec2.feature_extraction_wav2vec2.Wav2Vec2FeatureExtractor.zero_mean_unit_var_norm
- def zero_mean_unit_var_norm(
- input_values: list[np.ndarray], attention_mask: list[np.ndarray], padding_value: float = 0.0
- ) -> list[np.ndarray]:
- """
- Every array in the list is normalized to have zero mean and unit variance
- """
- if attention_mask is not None:
- attention_mask = np.array(attention_mask, np.int32)
- normed_input_values = []
-
- for vector, length in zip(input_values, attention_mask.sum(-1)):
- normed_slice = (vector - vector[:length].mean()) / np.sqrt(vector[:length].var() + 1e-7)
- if length < normed_slice.shape[0]:
- normed_slice[length:] = padding_value
-
- normed_input_values.append(normed_slice)
- else:
- normed_input_values = [(x - x.mean()) / np.sqrt(x.var() + 1e-7) for x in input_values]
-
- return normed_input_values
-
- def _extract_mel_features(
- self,
- one_waveform: np.ndarray,
- ) -> np.ndarray:
- """
- Extracts log-mel filterbank features for one waveform array (unbatched).
- """
- log_mel_spec = spectrogram(
- one_waveform,
- window=self.window,
- frame_length=self.sample_size,
- hop_length=self.sample_stride,
- fft_length=self.n_fft,
- mel_filters=self.mel_filters,
- mel_floor=self.mel_floor,
- log_mel="log10",
- )
- return log_mel_spec.T
-
- def __call__(
- self,
- audio: np.ndarray | list[float] | list[np.ndarray] | list[list[float]] | None = None,
- audio_target: np.ndarray | list[float] | list[np.ndarray] | list[list[float]] | None = None,
- padding: bool | str | PaddingStrategy = False,
- max_length: int | None = None,
- truncation: bool = False,
- pad_to_multiple_of: int | None = None,
- return_attention_mask: bool | None = None,
- return_tensors: str | TensorType | None = None,
- sampling_rate: int | None = None,
- **kwargs,
- ) -> BatchFeature:
- """
- Main method to featurize and prepare for the model one or several sequence(s).
-
- Pass in a value for `audio` to extract waveform features. Pass in a value for `audio_target` to extract log-mel
- spectrogram features.
-
- Args:
- audio (`np.ndarray`, `list[float]`, `list[np.ndarray]`, `list[list[float]]`, *optional*):
- The sequence or batch of sequences to be processed. Each sequence can be a numpy array, a list of float
- values, a list of numpy arrays or a list of list of float values. This outputs waveform features. Must
- be mono channel audio, not stereo, i.e. single float per timestep.
- audio_target (`np.ndarray`, `list[float]`, `list[np.ndarray]`, `list[list[float]]`, *optional*):
- The sequence or batch of sequences to be processed as targets. Each sequence can be a numpy array, a
- list of float values, a list of numpy arrays or a list of list of float values. This outputs log-mel
- spectrogram features.
- padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
- Select a strategy to pad the returned sequences (according to the model's padding side and padding
- index) among:
-
- - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
- sequence if provided).
- - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
- acceptable input length for the model if that argument is not provided.
- - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
- lengths).
- max_length (`int`, *optional*):
- Maximum length of the returned list and optionally padding length (see above).
- truncation (`bool`):
- Activates truncation to cut input sequences longer than *max_length* to *max_length*.
- pad_to_multiple_of (`int`, *optional*):
- If set will pad the sequence to a multiple of the provided value.
-
- This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
- `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
- return_attention_mask (`bool`, *optional*):
- Whether to return the attention mask. If left to the default, will return the attention mask according
- to the specific feature_extractor's default.
-
- [What are attention masks?](../glossary#attention-mask)
-
- return_tensors (`str` or [`~utils.TensorType`], *optional*):
- If set, will return tensors instead of list of python integers. Acceptable values are:
-
- - `'pt'`: Return PyTorch `torch.Tensor` objects.
- - `'np'`: Return Numpy `np.ndarray` objects.
- sampling_rate (`int`, *optional*):
- The sampling rate at which the `audio` or `audio_target` input was sampled. It is strongly recommended
- to pass `sampling_rate` at the forward call to prevent silent errors.
- """
- if audio is None and audio_target is None:
- raise ValueError("You must provide either `audio` or `audio_target` values.")
-
- if sampling_rate is not None:
- if sampling_rate != self.sampling_rate:
- raise ValueError(
- f"The model corresponding to this feature extractor: {self} was trained using a sampling rate of"
- f" {self.sampling_rate}. Please make sure that the provided audio input was sampled with"
- f" {self.sampling_rate} and not {sampling_rate}."
- )
- else:
- logger.warning(
- f"It is strongly recommended to pass the `sampling_rate` argument to `{self.__class__.__name__}()`. "
- "Failing to do so can result in silent errors that might be hard to debug."
- )
-
- if audio is not None:
- inputs = self._process_audio(
- audio,
- False,
- padding,
- max_length,
- truncation,
- pad_to_multiple_of,
- return_attention_mask,
- return_tensors,
- **kwargs,
- )
- else:
- inputs = None
-
- if audio_target is not None:
- inputs_target = self._process_audio(
- audio_target,
- True,
- padding,
- max_length,
- truncation,
- pad_to_multiple_of,
- return_attention_mask,
- return_tensors,
- **kwargs,
- )
-
- if inputs is None:
- return inputs_target
- else:
- inputs["labels"] = inputs_target["input_values"]
- decoder_attention_mask = inputs_target.get("attention_mask")
- if decoder_attention_mask is not None:
- inputs["decoder_attention_mask"] = decoder_attention_mask
-
- return inputs
-
- def _process_audio(
- self,
- speech: np.ndarray | list[float] | list[np.ndarray] | list[list[float]],
- is_target: bool = False,
- padding: bool | str | PaddingStrategy = False,
- max_length: int | None = None,
- truncation: bool = False,
- pad_to_multiple_of: int | None = None,
- return_attention_mask: bool | None = None,
- return_tensors: str | TensorType | None = None,
- **kwargs,
- ) -> BatchFeature:
- is_batched_numpy = isinstance(speech, np.ndarray) and len(speech.shape) > 1
- if is_batched_numpy and len(speech.shape) > 2:
- raise ValueError(f"Only mono-channel audio is supported for input to {self}")
- is_batched = is_batched_numpy or (
- isinstance(speech, (list, tuple)) and (isinstance(speech[0], (np.ndarray, tuple, list)))
- )
-
- if is_batched:
- speech = [np.asarray(speech, dtype=np.float32) for speech in speech]
- elif not is_batched and not isinstance(speech, np.ndarray):
- speech = np.asarray(speech, dtype=np.float32)
- elif isinstance(speech, np.ndarray) and speech.dtype is np.dtype(np.float64):
- speech = speech.astype(np.float32)
-
- # always return batch
- if not is_batched:
- speech = [speech]
-
- # needed to make pad() work on spectrogram inputs
- feature_size_hack = self.feature_size
-
- # convert into correct format for padding
- if is_target:
- features = [self._extract_mel_features(waveform) for waveform in speech]
- encoded_inputs = BatchFeature({"input_values": features})
- self.feature_size = self.num_mel_bins
- else:
- encoded_inputs = BatchFeature({"input_values": speech})
-
- padded_inputs = self.pad(
- encoded_inputs,
- padding=padding,
- max_length=max_length,
- truncation=truncation,
- pad_to_multiple_of=pad_to_multiple_of,
- return_attention_mask=return_attention_mask,
- **kwargs,
- )
-
- self.feature_size = feature_size_hack
-
- # convert input values to correct format
- input_values = padded_inputs["input_values"]
- if not isinstance(input_values[0], np.ndarray):
- padded_inputs["input_values"] = [np.asarray(array, dtype=np.float32) for array in input_values]
- elif (
- not isinstance(input_values, np.ndarray)
- and isinstance(input_values[0], np.ndarray)
- and input_values[0].dtype is np.dtype(np.float64)
- ):
- padded_inputs["input_values"] = [array.astype(np.float32) for array in input_values]
- elif isinstance(input_values, np.ndarray) and input_values.dtype is np.dtype(np.float64):
- padded_inputs["input_values"] = input_values.astype(np.float32)
-
- # convert attention_mask to correct format
- attention_mask = padded_inputs.get("attention_mask")
- if attention_mask is not None:
- padded_inputs["attention_mask"] = [np.asarray(array, dtype=np.int32) for array in attention_mask]
-
- # zero-mean and unit-variance normalization
- if not is_target and self.do_normalize:
- attention_mask = (
- attention_mask
- if self._get_padding_strategies(padding, max_length=max_length) is not PaddingStrategy.DO_NOT_PAD
- else None
- )
- padded_inputs["input_values"] = self.zero_mean_unit_var_norm(
- padded_inputs["input_values"], attention_mask=attention_mask, padding_value=self.padding_value
- )
-
- if return_tensors is not None:
- padded_inputs = padded_inputs.convert_to_tensors(return_tensors)
-
- return padded_inputs
-
- def to_dict(self) -> dict[str, Any]:
- output = super().to_dict()
-
- # Don't serialize these as they are derived from the other properties.
- names = ["window", "mel_filters", "sample_size", "sample_stride", "n_fft", "n_freqs"]
- for name in names:
- if name in output:
- del output[name]
-
- return output
+SpeechT5FeatureExtractor = deprecated_feature_extractor(SpeechT5AudioProcessor, "SpeechT5FeatureExtractor")
__all__ = ["SpeechT5FeatureExtractor"]
diff --git a/src/transformers/models/univnet/audio_processing_univnet.py b/src/transformers/models/univnet/audio_processing_univnet.py
new file mode 100644
index 000000000000..6bd6e16b2af3
--- /dev/null
+++ b/src/transformers/models/univnet/audio_processing_univnet.py
@@ -0,0 +1,89 @@
+# Copyright 2025 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+
+from ...audio_processing_backends import NumpyAudioBackend
+from ...audio_utils import MelScaleConfig, SpectrogramConfig, StftConfig
+
+
+class UnivNetAudioProcessor(NumpyAudioBackend):
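+    r"""
+    Audio processor for UnivNet. Extracts log-mel spectrogram features from raw mono speech sampled at 24 kHz using
+    an STFT implementation that follows Tacotron 2 and HiFi-GAN, with optional Tacotron 2-style linear normalization.
+    """
+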
+ sample_rate = 24000
+ force_mono = True
+ mask_level = "audio"
+ mel_floor = 1e-9
+ compression_clip_val = 1e-5
+ compression_factor = 1.0
+ do_normalize = False
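+    # Tacotron 2-style linear normalization bounds (original values from the Tacotron 2 implementation)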
+ normalize_min = -11.512925148010254
+ normalize_max = 2.3143386840820312
+ max_length_s = 10
+ spectrogram_config = SpectrogramConfig(
+ stft_config=StftConfig(
+ n_fft=1024,
+ hop_length=256,
+ center=False,
+ window_fn="hann",
+ periodic=True,
+ power=1.0,
+ ),
+ mel_scale_config=MelScaleConfig(
+ n_mels=100,
+ f_min=0.0,
+ f_max=12000.0,
+ mel_scale="slaney",
+ norm="slaney",
+ ),
+ log_mode="log",
+ mel_floor=1e-5,
+ computation_dtype="float64",
+ )
+
+ def __init__(self, **kwargs):
+ super().__init__(**kwargs)
+ self.num_max_samples = self.max_length_s * self.sample_rate
+
+ def _stft(self, audio, *, spectrogram_config, **kwargs):
+ # UnivNet uses reflect padding with (n_fft - hop_length) / 2 instead of center padding
+ stft_cfg = spectrogram_config.stft_config
+ pad_amount = int((stft_cfg.n_fft - stft_cfg.hop_length) / 2)
+ if audio.ndim > 1:
+ audio = np.pad(audio, ((0, 0), (pad_amount, pad_amount)), mode="reflect")
+ else:
+ audio = np.pad(audio, (pad_amount, pad_amount), mode="reflect")
+ return super()._stft(audio, spectrogram_config=spectrogram_config, **kwargs)
+
+ def _compute_magnitudes(self, stft_out, power, spectrogram_config=None):
+ # UnivNet adds mel_floor inside the sqrt: sqrt(real² + imag² + mel_floor)
+ return np.sqrt(np.real(stft_out) ** 2 + np.imag(stft_out) ** 2 + self.mel_floor)
+
+ def _apply_mel_scale(self, features, *, spectrogram_config, **kwargs):
+ # UnivNet applies mel filterbank without a floor
+ return np.matmul(self.mel_filters.T, features)
+
+ def _normalize_magnitude(self, features, *, spectrogram_config, **kwargs):
+ features = super()._normalize_magnitude(features, spectrogram_config=spectrogram_config, **kwargs)
+ if self.do_normalize:
+ features = 2 * ((features - self.normalize_min) / (self.normalize_max - self.normalize_min)) - 1
+ return features
+
+ def extract_spectrogram(self, audio, *, spectrogram_config, **kwargs):
+ features = super().extract_spectrogram(audio, spectrogram_config=spectrogram_config, **kwargs)
+ # Transpose from (..., n_mels, frames) to (..., frames, n_mels)
+ if isinstance(features, list):
+ return [np.swapaxes(f, -2, -1) for f in features]
+ return np.swapaxes(features, -2, -1)
+
+
+__all__ = ["UnivNetAudioProcessor"]
diff --git a/src/transformers/models/univnet/feature_extraction_univnet.py b/src/transformers/models/univnet/feature_extraction_univnet.py
index 84e9420a0f75..73ae758ee708 100644
--- a/src/transformers/models/univnet/feature_extraction_univnet.py
+++ b/src/transformers/models/univnet/feature_extraction_univnet.py
@@ -11,448 +11,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-"""Feature extractor class for UnivNetModel."""
+from ...utils.deprecation import deprecated_feature_extractor
+from .audio_processing_univnet import UnivNetAudioProcessor
-from typing import Any
-
-import numpy as np
-
-from ...audio_utils import mel_filter_bank, optimal_fft_length, spectrogram, window_function
-from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
-from ...feature_extraction_utils import BatchFeature
-from ...utils import PaddingStrategy, TensorType, logging
-
-
-logger = logging.get_logger(__name__)
-
-
-class UnivNetFeatureExtractor(SequenceFeatureExtractor):
- r"""
- Constructs a UnivNet feature extractor.
-
- This class extracts log-mel-filter bank features from raw speech using the short time Fourier Transform (STFT). The
- STFT implementation follows that of TacoTron 2 and Hifi-GAN.
-
- This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
- most of the main methods. Users should refer to this superclass for more information regarding those methods.
-
- Args:
- feature_size (`int`, *optional*, defaults to 1):
- The feature dimension of the extracted features.
- sampling_rate (`int`, *optional*, defaults to 24000):
- The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
- padding_value (`float`, *optional*, defaults to 0.0):
- The value to pad with when applying the padding strategy defined by the `padding` argument to
- [`UnivNetFeatureExtractor.__call__`]. Should correspond to audio silence. The `pad_end` argument to
- `__call__` will also use this padding value.
- do_normalize (`bool`, *optional*, defaults to `False`):
- Whether to perform Tacotron 2 normalization on the input. Normalizing can help to significantly improve the
- performance for some models.
- num_mel_bins (`int`, *optional*, defaults to 100):
- The number of mel-frequency bins in the extracted spectrogram features. This should match
- `UnivNetModel.config.num_mel_bins`.
- hop_length (`int`, *optional*, defaults to 256):
- The direct number of samples between sliding windows. Otherwise referred to as "shift" in many papers. Note
- that this is different from other audio feature extractors such as [`SpeechT5FeatureExtractor`] which take
- the `hop_length` in ms.
- win_length (`int`, *optional*, defaults to 1024):
- The direct number of samples for each sliding window. Note that this is different from other audio feature
- extractors such as [`SpeechT5FeatureExtractor`] which take the `win_length` in ms.
- win_function (`str`, *optional*, defaults to `"hann_window"`):
- Name for the window function used for windowing, must be accessible via `torch.{win_function}`
- filter_length (`int`, *optional*, defaults to 1024):
- The number of FFT components to use. If `None`, this is determined using
- `transformers.audio_utils.optimal_fft_length`.
- max_length_s (`int`, *optional*, defaults to 10):
- The maximum input length of the model in seconds. This is used to pad the audio.
- fmin (`float`, *optional*, defaults to 0.0):
- Minimum mel frequency in Hz.
- fmax (`float`, *optional*):
- Maximum mel frequency in Hz. If not set, defaults to `sampling_rate / 2`.
- mel_floor (`float`, *optional*, defaults to 1e-09):
- Minimum value of mel frequency banks. Note that the way [`UnivNetFeatureExtractor`] uses `mel_floor` is
- different than in [`transformers.audio_utils.spectrogram`].
- center (`bool`, *optional*, defaults to `False`):
- Whether to pad the waveform so that frame `t` is centered around time `t * hop_length`. If `False`, frame
- `t` will start at time `t * hop_length`.
- compression_factor (`float`, *optional*, defaults to 1.0):
- The multiplicative compression factor for dynamic range compression during spectral normalization.
- compression_clip_val (`float`, *optional*, defaults to 1e-05):
- The clip value applied to the waveform before applying dynamic range compression during spectral
- normalization.
- normalize_min (`float`, *optional*, defaults to -11.512925148010254):
- The min value used for Tacotron 2-style linear normalization. The default is the original value from the
- Tacotron 2 implementation.
- normalize_max (`float`, *optional*, defaults to 2.3143386840820312):
- The max value used for Tacotron 2-style linear normalization. The default is the original value from the
- Tacotron 2 implementation.
- model_in_channels (`int`, *optional*, defaults to 64):
- The number of input channels to the [`UnivNetModel`] model. This should match
- `UnivNetModel.config.model_in_channels`.
- pad_end_length (`int`, *optional*, defaults to 10):
- If padding the end of each waveform, the number of spectrogram frames worth of samples to append. The
- number of appended samples will be `pad_end_length * hop_length`.
- return_attention_mask (`bool`, *optional*, defaults to `True`):
- Whether or not [`~UnivNetFeatureExtractor.__call__`] should return `attention_mask`.
- """
-
- model_input_names = ["input_features", "noise_sequence", "padding_mask"]
-
- def __init__(
- self,
- feature_size: int = 1,
- sampling_rate: int = 24000,
- padding_value: float = 0.0,
- do_normalize: bool = False,
- num_mel_bins: int = 100,
- hop_length: int = 256,
- win_length: int = 1024,
- win_function: str = "hann_window",
- filter_length: int | None = 1024,
- max_length_s: int = 10,
- fmin: float = 0.0,
- fmax: float | None = None,
- mel_floor: float = 1e-9,
- center: bool = False,
- compression_factor: float = 1.0,
- compression_clip_val: float = 1e-5,
- normalize_min: float = -11.512925148010254,
- normalize_max: float = 2.3143386840820312,
- model_in_channels: int = 64,
- pad_end_length: int = 10,
- return_attention_mask=True,
- **kwargs,
- ):
- super().__init__(
- feature_size=feature_size,
- sampling_rate=sampling_rate,
- padding_value=padding_value,
- return_attention_mask=return_attention_mask,
- **kwargs,
- )
-
- self.do_normalize = do_normalize
-
- self.num_mel_bins = num_mel_bins
- self.hop_length = hop_length
- self.win_length = win_length
- self.win_function = win_function
- self.filter_length = filter_length
- self.fmin = fmin
- if fmax is None:
- # Follows the librosa.filters.mel implementation
- fmax = float(sampling_rate) / 2
- self.fmax = fmax
- self.mel_floor = mel_floor
-
- self.max_length_s = max_length_s
- self.num_max_samples = max_length_s * sampling_rate
-
- if self.filter_length is None:
- self.n_fft = optimal_fft_length(self.win_length)
- else:
- self.n_fft = self.filter_length
- self.n_freqs = (self.n_fft // 2) + 1
-
- self.window = window_function(window_length=self.win_length, name=self.win_function, periodic=True)
-
- self.mel_filters = mel_filter_bank(
- num_frequency_bins=self.n_freqs,
- num_mel_filters=self.num_mel_bins,
- min_frequency=self.fmin,
- max_frequency=self.fmax,
- sampling_rate=self.sampling_rate,
- norm="slaney",
- mel_scale="slaney",
- )
-
- self.center = center
- self.compression_factor = compression_factor
- self.compression_clip_val = compression_clip_val
- self.normalize_min = normalize_min
- self.normalize_max = normalize_max
- self.model_in_channels = model_in_channels
- self.pad_end_length = pad_end_length
-
- def normalize(self, spectrogram):
- return 2 * ((spectrogram - self.normalize_min) / (self.normalize_max - self.normalize_min)) - 1
-
- def denormalize(self, spectrogram):
- return self.normalize_min + (self.normalize_max - self.normalize_min) * ((spectrogram + 1) / 2)
-
- def mel_spectrogram(self, waveform: np.ndarray) -> np.ndarray:
- """
- Calculates log MEL spectrograms from a batch of waveforms. Note that the input waveform(s) will be padded by
- `int(self.n_fft - self.hop_length) / 2` on both sides using the `reflect` padding mode.
-
- Args:
- waveform (`np.ndarray` of shape `(length,)`):
- The input waveform. This must be a single real-valued, mono waveform.
-
- Returns:
- `numpy.ndarray`: Array containing a log-mel spectrogram of shape `(num_frames, num_mel_bins)`.
- """
- # Do custom padding based on the official MelGAN and Hifi-GAN implementations
- # See https://github.com/maum-ai/univnet/blob/9bb2b54838bb6d7ce767131cc7b8b61198bc7558/utils/stft.py#L84-L86
- waveform = np.pad(
- waveform,
- (int((self.n_fft - self.hop_length) / 2), int((self.n_fft - self.hop_length) / 2)),
- mode="reflect",
- )
-
- # Get the complex spectrogram.
- # Note: waveform must be unbatched currently due to the implementation of spectrogram(...).
- complex_spectrogram = spectrogram(
- waveform,
- window=self.window,
- frame_length=self.n_fft,
- hop_length=self.hop_length,
- fft_length=self.n_fft,
- power=None,
- center=self.center,
- mel_filters=None,
- mel_floor=None,
- )
-
- # Apply the MEL filter bank and MEL floor manually since UnivNet uses a slightly different implementation
- amplitude_spectrogram = np.sqrt(
- np.real(complex_spectrogram) ** 2 + np.imag(complex_spectrogram) ** 2 + self.mel_floor
- )
- mel_spectrogram = np.matmul(self.mel_filters.T, amplitude_spectrogram)
-
- # Perform spectral normalization to get the log mel spectrogram.
- log_mel_spectrogram = np.log(
- np.clip(mel_spectrogram, a_min=self.compression_clip_val, a_max=None) * self.compression_factor
- )
-
- # Return spectrogram with num_mel_bins last
- return log_mel_spectrogram.T
-
- def generate_noise(
- self,
- noise_length: int,
- generator: np.random.Generator | None = None,
- ) -> np.ndarray:
- """
- Generates a random noise sequence of standard Gaussian noise for use in the `noise_sequence` argument of
- [`UnivNetModel.forward`].
-
- Args:
- spectrogram_length (`int`):
- The length (dim 0) of the generated noise.
- model_in_channels (`int`, *optional*, defaults to `None`):
- The number of features (dim 1) of the generated noise. This should correspond to the
- `model_in_channels` of the [`UnivNetGan`] model. If not set, this will default to
- `self.config.model_in_channels`.
- generator (`numpy.random.Generator`, *optional*, defaults to `None`)
- An optional `numpy.random.Generator` random number generator to control noise generation. If not set, a
- new generator with fresh entropy will be created.
-
- Returns:
- `numpy.ndarray`: Array containing random standard Gaussian noise of shape `(noise_length,
- model_in_channels)`.
- """
- if generator is None:
- generator = np.random.default_rng()
-
- noise_shape = (noise_length, self.model_in_channels)
- noise = generator.standard_normal(noise_shape, dtype=np.float32)
-
- return noise
-
- def batch_decode(self, waveforms, waveform_lengths=None) -> list[np.ndarray]:
- r"""
- Removes padding from generated audio after running [`UnivNetModel.forward`]. This returns a ragged list of 1D
- audio waveform arrays and not a single tensor/array because in general the waveforms will have different
- lengths after removing padding.
-
- Args:
- waveforms (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
- The batched output waveforms from the [`UnivNetModel`].
- waveform_lengths (`torch.FloatTensor` of shape `(batch_size,)`, *optional*):
- The batched lengths of each waveform before padding.
-
- Returns:
- `list[np.ndarray]`: A ragged list of 1D waveform arrays with padding removed.
- """
- # Collapse the batched waveform tensor to a list of 1D audio waveforms
- waveforms = [waveform.detach().to(device="cpu", copy=True).numpy() for waveform in waveforms]
-
- if waveform_lengths is not None:
- waveforms = [waveform[: waveform_lengths[i]] for i, waveform in enumerate(waveforms)]
-
- return waveforms
-
- def __call__(
- self,
- raw_speech: np.ndarray | list[float] | list[np.ndarray] | list[list[float]],
- sampling_rate: int | None = None,
- padding: bool | str | PaddingStrategy = True,
- max_length: int | None = None,
- truncation: bool = True,
- pad_to_multiple_of: int | None = None,
- return_noise: bool = True,
- generator: np.random.Generator | None = None,
- pad_end: bool = False,
- pad_length: int | None = None,
- do_normalize: str | None = None,
- return_attention_mask: bool | None = None,
- return_tensors: str | TensorType | None = None,
- ) -> BatchFeature:
- """
- Main method to featurize and prepare for the model one or several sequence(s).
-
- Args:
- raw_speech (`np.ndarray`, `list[float]`, `list[np.ndarray]`, `list[list[float]]`):
- The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
- values, a list of numpy arrays or a list of list of float values. Must be mono channel audio, not
- stereo, i.e. single float per timestep.
- sampling_rate (`int`, *optional*):
- The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass
- `sampling_rate` at the forward call to prevent silent errors and allow automatic speech recognition
- pipeline.
- padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
- Select a strategy to pad the input `raw_speech` waveforms (according to the model's padding side and
- padding index) among:
-
- - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
- sequence if provided).
- - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
- acceptable input length for the model if that argument is not provided.
- - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
- lengths).
-
- If `pad_end = True`, that padding will occur before the `padding` strategy is applied.
- max_length (`int`, *optional*):
- Maximum length of the returned list and optionally padding length (see above).
- truncation (`bool`, *optional*, defaults to `True`):
- Activates truncation to cut input sequences longer than `max_length` to `max_length`.
- pad_to_multiple_of (`int`, *optional*):
- If set will pad the sequence to a multiple of the provided value.
-
- This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
- `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
- return_noise (`bool`, *optional*, defaults to `True`):
- Whether to generate and return a noise waveform for use in [`UnivNetModel.forward`].
- generator (`numpy.random.Generator`, *optional*, defaults to `None`):
- An optional `numpy.random.Generator` random number generator to use when generating noise.
- pad_end (`bool`, *optional*, defaults to `False`):
- Whether to pad the end of each waveform with silence. This can help reduce artifacts at the end of the
- generated audio sample; see https://github.com/seungwonpark/melgan/issues/8 for more details. This
- padding will be done before the padding strategy specified in `padding` is performed.
- pad_length (`int`, *optional*, defaults to `None`):
- If padding the end of each waveform, the length of the padding in spectrogram frames. If not set, this
- will default to `self.config.pad_end_length`.
- do_normalize (`bool`, *optional*):
- Whether to perform Tacotron 2 normalization on the input. Normalizing can help to significantly improve
- the performance for some models. If not set, this will default to `self.config.do_normalize`.
- return_attention_mask (`bool`, *optional*):
- Whether to return the attention mask. If left to the default, will return the attention mask according
- to the specific feature_extractor's default.
-
- [What are attention masks?](../glossary#attention-mask)
-
- return_tensors (`str` or [`~utils.TensorType`], *optional*):
- If set, will return tensors instead of list of python integers. Acceptable values are:
-
- - `'pt'`: Return PyTorch `torch.np.array` objects.
- - `'np'`: Return Numpy `np.ndarray` objects.
- """
- do_normalize = do_normalize if do_normalize is not None else self.do_normalize
-
- if sampling_rate is not None:
- if sampling_rate != self.sampling_rate:
- raise ValueError(
- f"The model corresponding to this feature extractor: {self.__class__.__name__} was trained using a"
- f" sampling rate of {self.sampling_rate}. Please make sure that the provided `raw_speech` input"
- f" was sampled with {self.sampling_rate} and not {sampling_rate}."
- )
- else:
- logger.warning(
- f"It is strongly recommended to pass the `sampling_rate` argument to `{self.__class__.__name__}()`. "
- "Failing to do so can result in silent errors that might be hard to debug."
- )
-
- is_batched_numpy = isinstance(raw_speech, np.ndarray) and len(raw_speech.shape) > 1
- if is_batched_numpy and len(raw_speech.shape) > 2:
- raise ValueError(f"Only mono-channel audio is supported for input to {self}")
- is_batched = is_batched_numpy or (
- isinstance(raw_speech, (list, tuple)) and (isinstance(raw_speech[0], (np.ndarray, tuple, list)))
- )
-
- if is_batched:
- raw_speech = [np.asarray(speech, dtype=np.float32) for speech in raw_speech]
- elif not is_batched and not isinstance(raw_speech, np.ndarray):
- raw_speech = np.asarray(raw_speech, dtype=np.float32)
- elif isinstance(raw_speech, np.ndarray) and raw_speech.dtype is np.dtype(np.float64):
- raw_speech = raw_speech.astype(np.float32)
-
- # always return batch
- if not is_batched:
- raw_speech = [np.asarray(raw_speech, dtype=np.float32)]
-
- # Pad end to reduce artifacts
- if pad_end:
- pad_length = pad_length if pad_length is not None else self.pad_end_length
- raw_speech = [
- np.pad(waveform, (0, pad_length * self.hop_length), constant_values=self.padding_value)
- for waveform in raw_speech
- ]
-
- batched_speech = BatchFeature({"input_features": raw_speech})
-
- padded_inputs = self.pad(
- batched_speech,
- padding=padding,
- max_length=max_length if max_length is not None else self.num_max_samples,
- truncation=truncation,
- pad_to_multiple_of=pad_to_multiple_of,
- return_attention_mask=return_attention_mask,
- )
-
- # make sure list is in array format
- # input_features = padded_inputs.get("input_features").transpose(2, 0, 1)
- input_features = padded_inputs.get("input_features")
-
- mel_spectrograms = [self.mel_spectrogram(waveform) for waveform in input_features]
-
- if isinstance(input_features[0], list):
- batched_speech["input_features"] = [np.asarray(mel, dtype=np.float32) for mel in mel_spectrograms]
- else:
- batched_speech["input_features"] = [mel.astype(np.float32) for mel in mel_spectrograms]
-
- # convert attention_mask to correct format
- attention_mask = padded_inputs.get("attention_mask")
- if attention_mask is not None:
- batched_speech["padding_mask"] = [np.asarray(array, dtype=np.int32) for array in attention_mask]
-
- if return_noise:
- noise = [
- self.generate_noise(spectrogram.shape[0], generator)
- for spectrogram in batched_speech["input_features"]
- ]
- batched_speech["noise_sequence"] = noise
-
- if do_normalize:
- batched_speech["input_features"] = [
- self.normalize(spectrogram) for spectrogram in batched_speech["input_features"]
- ]
-
- if return_tensors is not None:
- batched_speech = batched_speech.convert_to_tensors(return_tensors)
-
- return batched_speech
-
- def to_dict(self) -> dict[str, Any]:
- output = super().to_dict()
-
- # Don't serialize these as they are derived from the other properties.
- names = ["window", "mel_filters", "n_fft", "n_freqs", "num_max_samples"]
- for name in names:
- if name in output:
- del output[name]
-
- return output
+UnivNetFeatureExtractor = deprecated_feature_extractor(UnivNetAudioProcessor, "UnivNetFeatureExtractor")
__all__ = ["UnivNetFeatureExtractor"]
diff --git a/src/transformers/models/vibevoice_acoustic_tokenizer/audio_processing_vibevoice_acoustic_tokenizer.py b/src/transformers/models/vibevoice_acoustic_tokenizer/audio_processing_vibevoice_acoustic_tokenizer.py
new file mode 100644
index 000000000000..866113b39b82
--- /dev/null
+++ b/src/transformers/models/vibevoice_acoustic_tokenizer/audio_processing_vibevoice_acoustic_tokenizer.py
@@ -0,0 +1,38 @@
+# Copyright 2025 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+
+from ...audio_processing_backends import TorchAudioBackend
+
+
+class VibevoiceAcousticTokenizerAudioProcessor(TorchAudioBackend):
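+    r"""
+    Audio processor for the VibeVoice acoustic tokenizer. Normalizes each mono 24 kHz waveform to a target dB FS
+    level and rescales it if the peak amplitude exceeds 1.0.
+    """
+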
+ sample_rate = 24000
+ force_mono = True
+ add_channel_dim = True
+
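+    # Loudness normalization target in dB FS and a small epsilon to avoid division by zero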
+ target_dB_FS = -25
+ eps = 1e-6
+
+ def _process_audio(self, audio_el):
+ audio_el = super()._process_audio(audio_el)
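+        # Scale so the RMS level matches target_dB_FS, then rescale if the peak exceeds 1.0 to avoid clipping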
+ rms = torch.sqrt(torch.mean(audio_el**2))
+ audio_el = audio_el * (10 ** (self.target_dB_FS / 20) / (rms + self.eps))
+ max_val = torch.max(torch.abs(audio_el))
+ if max_val > 1.0:
+ audio_el = audio_el / (max_val + self.eps)
+ return audio_el
+
+
+__all__ = ["VibevoiceAcousticTokenizerAudioProcessor"]
diff --git a/src/transformers/models/vibevoice_acoustic_tokenizer/feature_extraction_vibevoice_acoustic_tokenizer.py b/src/transformers/models/vibevoice_acoustic_tokenizer/feature_extraction_vibevoice_acoustic_tokenizer.py
index 6e0c82762283..0f38cd9df814 100644
--- a/src/transformers/models/vibevoice_acoustic_tokenizer/feature_extraction_vibevoice_acoustic_tokenizer.py
+++ b/src/transformers/models/vibevoice_acoustic_tokenizer/feature_extraction_vibevoice_acoustic_tokenizer.py
@@ -11,134 +11,12 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+from ...utils.deprecation import deprecated_feature_extractor
+from .audio_processing_vibevoice_acoustic_tokenizer import VibevoiceAcousticTokenizerAudioProcessor
-from ...audio_utils import AudioInput, make_list_of_audio
-from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
-from ...feature_extraction_utils import BatchFeature
-from ...utils import PaddingStrategy, logging
-from ...utils.import_utils import is_torch_available, requires
-
-
-if is_torch_available():
- import torch
-
-logger = logging.get_logger(__name__)
-
-
-@requires(backends=("torch",))
-class VibeVoiceAcousticTokenizerFeatureExtractor(SequenceFeatureExtractor):
- r"""
- Constructs a VibeVoiceAcousticTokenizer feature extractor.
-
- Args:
- feature_size (`int`, *optional*, defaults to 1):
- The number of channels.
- sampling_rate (`int`, *optional*, defaults to 24000):
- The sampling rate at which the audio waveform should be digitalized, expressed in hertz (Hz).
- padding_value (`float`, *optional*, defaults to 0.0):
- The value that is used for padding.
- normalize_audio (`bool`, *optional*, defaults to `True`):
- Whether to normalize audio to a target dB FS.
- target_dB_FS (`float`, *optional*, defaults to -25):
- Target dB FS for normalization.
- eps (`float`, *optional*, defaults to 1e-06):
- A small value to avoid division by zero when normalizing.
-
- """
-
- model_input_names = ["input_values", "padding_mask"]
-
- def __init__(
- self,
- feature_size=1,
- sampling_rate=24000,
- padding_value=0.0,
- normalize_audio=True,
- target_dB_FS=-25,
- eps=1e-6,
- **kwargs,
- ):
- super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs)
-
- self.normalize_audio = normalize_audio
- self.target_dB_FS = target_dB_FS
- self.eps = eps
-
- def __call__(
- self,
- audio: AudioInput,
- sampling_rate: int | None = None,
- padding: bool | str | PaddingStrategy | None = True,
- pad_to_multiple_of: int | None = None,
- return_attention_mask: bool | None = True,
- ) -> BatchFeature:
- """
- Args:
- audio (`np.ndarray`, `torch.Tensor`, `list[np.ndarray]`, `list[torch.Tensor]`:
- The sequence or batch of sequences to be processed. Each sequence can be a numpy array, a torch tensor,
- a list of numpy arrays or a list of torch tensors.
- sampling_rate (`int`, *optional*):
- The sampling rate at which the `audio` input was sampled. It is strongly recommended to pass
- `sampling_rate` at the forward call to prevent silent errors.
- padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
- Select a strategy to pad the returned sequences (according to the model's padding side and padding
- index) among:
-
- - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
- sequence if provided).
- - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
- acceptable input length for the model if that argument is not provided.
- - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
- lengths).
- pad_to_multiple_of (`int`, *optional*):
- If set will pad the sequence to a multiple of the provided value.
-
- """
- if sampling_rate is not None:
- if sampling_rate != self.sampling_rate:
- raise ValueError(
- f"The model corresponding to this feature extractor: {self} was trained using a sampling rate of"
- f" {self.sampling_rate}. Please make sure that the provided audio input was sampled with"
- f" {self.sampling_rate} and not {sampling_rate}."
- )
- else:
- logger.warning(
- f"It is strongly recommended to pass the `sampling_rate` argument to `{self.__class__.__name__}()`. "
- "Failing to do so can result in silent errors that might be hard to debug."
- )
-
- # Ensure batch of mono tensors
- audio = make_list_of_audio(audio)
- for idx, example in enumerate(audio):
- example = torch.tensor(example, dtype=torch.float32)
- if example.ndim != 1:
- raise ValueError(f"Audio should be mono, got shape: {example.shape}")
- audio[idx] = example
-
- if self.normalize_audio:
- for idx, example in enumerate(audio):
- rms = torch.sqrt(torch.mean(example**2))
- example *= 10 ** (self.target_dB_FS / 20) / (rms + self.eps)
- max_val = torch.max(torch.abs(example))
- if max_val > 1.0:
- example = example / (max_val + self.eps)
- audio[idx] = example
-
- output_values = BatchFeature({"input_values": audio})
- if padding or pad_to_multiple_of:
- output_values = self.pad(
- output_values,
- padding=padding,
- pad_to_multiple_of=pad_to_multiple_of,
- return_attention_mask=return_attention_mask,
- )
- if return_attention_mask:
- output_values["padding_mask"] = output_values.pop("attention_mask")
-
- # add channel dimension
- output_values["input_values"] = output_values["input_values"][:, None, :]
-
- return output_values
+VibeVoiceAcousticTokenizerFeatureExtractor = deprecated_feature_extractor(
+ VibevoiceAcousticTokenizerAudioProcessor, "VibeVoiceAcousticTokenizerFeatureExtractor"
+)
__all__ = ["VibeVoiceAcousticTokenizerFeatureExtractor"]
diff --git a/src/transformers/models/voxtral_realtime/audio_processing_voxtral_realtime.py b/src/transformers/models/voxtral_realtime/audio_processing_voxtral_realtime.py
new file mode 100644
index 000000000000..59ff6ad89176
--- /dev/null
+++ b/src/transformers/models/voxtral_realtime/audio_processing_voxtral_realtime.py
@@ -0,0 +1,62 @@
+# Copyright 2025 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+
+from ...audio_processing_backends import TorchAudioBackend
+from ...audio_utils import MelScaleConfig, SpectrogramConfig, StftConfig
+
+
+class VoxtralRealtimeAudioProcessor(TorchAudioBackend):
+ sample_rate = 16000
+ force_mono = True
+ spectrogram_config = SpectrogramConfig(
+ stft_config=StftConfig(
+ n_fft=400,
+ hop_length=160,
+ power=2.0,
+ ),
+ mel_scale_config=MelScaleConfig(
+ n_mels=128,
+ mel_scale="slaney",
+ norm="slaney",
+ computation_dtype="float64",
+ ),
+ log_mode="log10",
+ skip_last_frame=True,
+ )
+ global_log_mel_max = 1.5
+
+ def _apply_mel_scale(self, features, *, spectrogram_config, **kwargs):
+ mel_filters = self.mel_filters.to(device=features.device)
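+        # (n_mels, num_freq_bins) @ (..., num_freq_bins, num_frames) -> (..., n_mels, num_frames),
+        # clamped from below at `spectrogram_config.mel_floor`.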
+ return torch.clamp(torch.matmul(mel_filters.T, features), min=spectrogram_config.mel_floor)
+
+ def _normalize_magnitude(self, features, *, spectrogram_config, **kwargs):
+ features = super()._normalize_magnitude(features, spectrogram_config=spectrogram_config, **kwargs)
+
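+        # Same dynamic-range compression as Whisper's log-mel features, except the reference
+        # maximum is a fixed global value when `global_log_mel_max` is set (presumably so the
+        # scaling does not depend on the content of each realtime chunk); otherwise it falls
+        # back to the per-example maximum.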
+ if self.global_log_mel_max is not None:
+ spec_max = torch.tensor(self.global_log_mel_max, device=features.device, dtype=features.dtype)
+ else:
+ spec_max = features.amax(dim=(-2, -1), keepdim=True)
+ features = torch.maximum(features, spec_max - 8.0)
+ features = (features + 4.0) / 4.0
+ return features
+
+ def _get_features_lengths(self, audio_lengths, spectrogram_config, include_center_frame=False):
+ stft_cfg = spectrogram_config.stft_config
+ win_length = stft_cfg.win_length or stft_cfg.n_fft
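+        # Frame count of a non-centered STFT: floor((num_samples - win_length) / hop_length) + 1.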
+ return (audio_lengths - win_length) // stft_cfg.hop_length + 1
+
+
+__all__ = ["VoxtralRealtimeAudioProcessor"]
diff --git a/src/transformers/models/voxtral_realtime/feature_extraction_voxtral_realtime.py b/src/transformers/models/voxtral_realtime/feature_extraction_voxtral_realtime.py
index 58355f3c0d7c..09e49995be51 100644
--- a/src/transformers/models/voxtral_realtime/feature_extraction_voxtral_realtime.py
+++ b/src/transformers/models/voxtral_realtime/feature_extraction_voxtral_realtime.py
@@ -11,236 +11,12 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+from ...utils.deprecation import deprecated_feature_extractor
+from .audio_processing_voxtral_realtime import VoxtralRealtimeAudioProcessor
-import numpy as np
-import torch
-
-from ...audio_utils import mel_filter_bank
-from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
-from ...feature_extraction_utils import BatchFeature
-from ...utils import TensorType, logging
-from ...utils.import_utils import requires
-
-
-logger = logging.get_logger(__name__)
-
-
-@requires(backends=("torch",))
-class VoxtralRealtimeFeatureExtractor(SequenceFeatureExtractor):
- r"""
- Constructs a VOXTRAL_REALTIME feature extractor.
-
- This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
- most of the main methods. Users should refer to this superclass for more information regarding those methods.
-
- This class extracts mel-filter bank features from raw speech using a custom numpy implementation of the `Short Time
- Fourier Transform` which should match pytorch's `torch.stft` equivalent.
-
- Args:
- feature_size (`int`, *optional*, defaults to 128):
- The feature dimension of the extracted features.
- sampling_rate (`int`, *optional*, defaults to 16000):
- The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
- hop_length (`int`, *optional*, defaults to 160):
- Length of the overlapping windows for the STFT used to obtain the Mel Frequency coefficients.
- n_fft (`int`, *optional*, defaults to 512):
- Size of the Fourier transform.
- win_length (`int`, *optional*, defaults to 400):
- The window length for the STFT computation.
- padding_value (`float`, *optional*, defaults to 0.0):
- Padding value used to pad the audio. Should correspond to silences.
- """
-
- model_input_names = ["input_features", "attention_mask"]
-
- def __init__(
- self,
- feature_size=128,
- sampling_rate=16000,
- hop_length=160,
- n_fft=400,
- win_length=400,
- padding_value=0.0,
- global_log_mel_max=1.5,
- **kwargs,
- ):
- super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs)
-
- self.hop_length = hop_length
- self.n_fft = n_fft
- self.win_length = win_length
- self.mel_filters = mel_filter_bank(
- num_frequency_bins=1 + n_fft // 2,
- num_mel_filters=feature_size,
- min_frequency=0.0,
- max_frequency=8000.0,
- sampling_rate=sampling_rate,
- norm="slaney",
- mel_scale="slaney",
- )
- self.global_log_mel_max = global_log_mel_max
-
- def _torch_extract_fbank_features(self, waveform, device: str = "cpu", center: bool = True):
- window = torch.hann_window(self.n_fft, device=device)
- stft = torch.stft(waveform, self.n_fft, self.hop_length, window=window, return_complex=True, center=center)
- magnitudes = stft[..., :-1].abs() ** 2
-
- mel_filters = torch.from_numpy(self.mel_filters).to(device, torch.float32)
- mel_spec = mel_filters.T @ magnitudes
-
- log_spec = torch.clamp(mel_spec, min=1e-10).log10()
- if self.global_log_mel_max is not None:
- log_spec_max = torch.tensor(
- self.global_log_mel_max,
- device=log_spec.device,
- dtype=log_spec.dtype,
- )
- else:
- log_spec_max = log_spec.max()
-
- log_spec = torch.maximum(log_spec, log_spec_max - 8.0)
- log_spec = (log_spec + 4.0) / 4.0
- if device != "cpu":
- log_spec = log_spec.detach().cpu()
- return log_spec
-
- def __call__(
- self,
- raw_speech: np.ndarray | list[float] | list[np.ndarray] | list[list[float]],
- truncation: bool = False,
- pad_to_multiple_of: int | None = None,
- return_tensors: str | TensorType | None = None,
- return_attention_mask: bool | None = None,
- padding: str | None = "longest",
- max_length: int | None = None,
- sampling_rate: int | None = None,
- do_normalize: bool | None = None,
- device: str | None = "cpu",
- return_token_timestamps: bool | None = None,
- center: bool = True,
- **kwargs,
- ) -> BatchFeature:
- """
- Main method to featurize and prepare for the model one or several sequence(s). Implementation uses PyTorch for
- the STFT computation if available, otherwise a slower NumPy based one.
-
- Args:
- raw_speech (`np.ndarray`, `list[float]`, `list[np.ndarray]`, `list[list[float]]`):
- The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
- values, a list of numpy arrays or a list of list of float values. Must be mono channel audio, not
- stereo, i.e. single float per timestep.
- truncation (`bool`, *optional*, default to `True`):
- Activates truncation to cut input sequences longer than *max_length* to *max_length*.
- pad_to_multiple_of (`int`, *optional*, defaults to None):
- If set will pad the sequence to a multiple of the provided value.
-
- This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
- `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
- return_attention_mask (`bool`, *optional*):
- Whether to return the attention mask. If left to the default, will return the attention mask according
- to the specific feature_extractor's default.
-
- [What are attention masks?](../glossary#attention-mask)
-
-
-
- For Parakeet models, `attention_mask` should always be passed for batched inference, to avoid subtle
- bugs.
-
-
-
- return_tensors (`str` or [`~utils.TensorType`], *optional*):
- If set, will return tensors instead of list of python integers. Acceptable values are:
-
- - `'tf'`: Return TensorFlow `tf.constant` objects.
- - `'pt'`: Return PyTorch `torch.Tensor` objects.
- - `'np'`: Return Numpy `np.ndarray` objects.
- sampling_rate (`int`, *optional*):
- The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass
- `sampling_rate` at the forward call to prevent silent errors and allow automatic speech recognition
- pipeline.
- padding_value (`float`, *optional*, defaults to 0.0):
- The value that is used to fill the padding values / vectors.
- do_normalize (`bool`, *optional*, defaults to `False`):
- Whether or not to zero-mean unit-variance normalize the input. Normalizing can help to significantly
- improve the performance of the model.
- device (`str`, *optional*, defaults to `'cpu'`):
- Specifies the device for computation of the log-mel spectrogram of audio signals in the
- `_torch_extract_fbank_features` method. (e.g., "cpu", "cuda")
- return_token_timestamps (`bool`, *optional*, defaults to `None`):
- Deprecated. Use `return_attention_mask` instead from which the number of frames can be inferred.
-
- Whether or not to return the number of frames of the input raw_speech.
- These num_frames can be used by the model to compute word level timestamps.
- center (`bool`, *optional*, defaults to `True`):
- Whether to use centering for the STFT computation.
- """
- if sampling_rate is not None:
- if sampling_rate != self.sampling_rate:
- raise ValueError(
- f"The model corresponding to this feature extractor: {self.__class__.__name__} was trained using a"
- f" sampling rate of {self.sampling_rate}. Please make sure that the provided `raw_speech` input"
- f" was sampled with {self.sampling_rate} and not {sampling_rate}."
- )
- else:
- logger.warning(
- f"It is strongly recommended to pass the `sampling_rate` argument to `{self.__class__.__name__}()`. "
- "Failing to do so can result in silent errors that might be hard to debug."
- )
-
- # Convert to torch tensor
- if isinstance(raw_speech, np.ndarray):
- raw_speech = torch.tensor(raw_speech)
- elif isinstance(raw_speech, (list, tuple)):
- if isinstance(raw_speech[0], (list, np.ndarray)):
- raw_speech = [torch.tensor(speech) for speech in raw_speech]
- else: # list[float]
- raw_speech = torch.tensor(raw_speech)
-
- is_batched_torch = isinstance(raw_speech, torch.Tensor) and len(raw_speech.shape) > 1
- if is_batched_torch and len(raw_speech.shape) > 2:
- logger.warning(
- f"Only mono-channel audio is supported for input to {self.__class__.__name__}. "
- "We will take the mean of the channels to convert to mono."
- )
- raw_speech = raw_speech.mean(-1)
-
- is_batched_sequence = isinstance(raw_speech, (list, tuple))
- if is_batched_sequence:
- for speech in raw_speech:
- if len(speech.shape) > 1:
- logger.warning(
- f"Only mono-channel audio is supported for input to {self.__class__.__name__}. "
- "We will take the mean of the channels to convert to mono."
- )
- speech = speech.mean(-1)
-
- if is_batched_torch or is_batched_sequence:
- raw_speech = [speech[:, None].to(torch.float32) for speech in raw_speech]
- else:
- raw_speech = [raw_speech[:, None].to(torch.float32)]
-
- batched_speech = BatchFeature({"input_features": raw_speech})
- padded_inputs = self.pad(
- batched_speech,
- padding=padding,
- max_length=max_length,
- truncation=truncation,
- pad_to_multiple_of=pad_to_multiple_of,
- return_attention_mask=return_attention_mask,
- return_tensors="pt",
- )
- input_features = padded_inputs.input_features.squeeze(-1)
- input_features = self._torch_extract_fbank_features(input_features, device, center)
- data = {
- "input_features": input_features.to(torch.float32),
- }
-
- if return_attention_mask:
- attention_mask = padded_inputs.attention_mask[:, self.win_length - 1 :: self.hop_length]
- data["attention_mask"] = attention_mask.to(torch.bool)
-
- return BatchFeature(data=data, tensor_type=return_tensors)
+VoxtralRealtimeFeatureExtractor = deprecated_feature_extractor(
+ VoxtralRealtimeAudioProcessor, "VoxtralRealtimeFeatureExtractor"
+)
__all__ = ["VoxtralRealtimeFeatureExtractor"]
diff --git a/src/transformers/models/wav2vec2/audio_processing_wav2vec2.py b/src/transformers/models/wav2vec2/audio_processing_wav2vec2.py
new file mode 100644
index 000000000000..66467620f39d
--- /dev/null
+++ b/src/transformers/models/wav2vec2/audio_processing_wav2vec2.py
@@ -0,0 +1,34 @@
+# Copyright 2025 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+
+from ...audio_processing_backends import TorchAudioBackend
+
+
+class Wav2Vec2AudioProcessor(TorchAudioBackend):
+ sample_rate = 16000
+ force_mono = True
+ do_normalize = True
+
+ def _process_audio(self, audio_el):
+ audio_el = super()._process_audio(audio_el)
+
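+        # Per-waveform zero-mean, unit-variance normalization; `correction=0` is the population
+        # variance and 1e-7 guards against division by zero on silent inputs.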
+ if self.do_normalize:
+ audio_el = (audio_el - audio_el.mean()) / torch.sqrt(audio_el.var(correction=0) + 1e-7)
+
+ return audio_el
+
+
+__all__ = ["Wav2Vec2AudioProcessor"]
diff --git a/src/transformers/models/wav2vec2/feature_extraction_wav2vec2.py b/src/transformers/models/wav2vec2/feature_extraction_wav2vec2.py
index dea2f3af5b48..bc4c8fdee07e 100644
--- a/src/transformers/models/wav2vec2/feature_extraction_wav2vec2.py
+++ b/src/transformers/models/wav2vec2/feature_extraction_wav2vec2.py
@@ -11,229 +11,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-"""
-Feature extractor class for Wav2Vec2
-"""
+from ...utils.deprecation import deprecated_feature_extractor
+from .audio_processing_wav2vec2 import Wav2Vec2AudioProcessor
-import numpy as np
-
-from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
-from ...feature_extraction_utils import BatchFeature
-from ...utils import PaddingStrategy, TensorType, logging
-
-
-logger = logging.get_logger(__name__)
-
-
-class Wav2Vec2FeatureExtractor(SequenceFeatureExtractor):
- r"""
- Constructs a Wav2Vec2 feature extractor.
-
- This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
- most of the main methods. Users should refer to this superclass for more information regarding those methods.
-
- Args:
- feature_size (`int`, *optional*, defaults to 1):
- The feature dimension of the extracted features.
- sampling_rate (`int`, *optional*, defaults to 16000):
- The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
- padding_value (`float`, *optional*, defaults to 0.0):
- The value that is used to fill the padding values.
- do_normalize (`bool`, *optional*, defaults to `True`):
- Whether or not to zero-mean unit-variance normalize the input. Normalizing can help to significantly
- improve the performance for some models, *e.g.*,
- [wav2vec2-lv60](https://huggingface.co/models?search=lv60).
- return_attention_mask (`bool`, *optional*, defaults to `False`):
- Whether or not [`~Wav2Vec2FeatureExtractor.__call__`] should return `attention_mask`.
-
-
-
- Wav2Vec2 models that have set `config.feat_extract_norm == "group"`, such as
- [wav2vec2-base](https://huggingface.co/facebook/wav2vec2-base-960h), have **not** been trained using
- `attention_mask`. For such models, `input_values` should simply be padded with 0 and no `attention_mask`
- should be passed.
-
- For Wav2Vec2 models that have set `config.feat_extract_norm == "layer"`, such as
- [wav2vec2-lv60](https://huggingface.co/facebook/wav2vec2-large-960h-lv60-self), `attention_mask` should be
- passed for batched inference.
-
- """
-
- model_input_names = ["input_values", "attention_mask"]
-
- def __init__(
- self,
- feature_size=1,
- sampling_rate=16000,
- padding_value=0.0,
- return_attention_mask=False,
- do_normalize=True,
- **kwargs,
- ):
- super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs)
- self.return_attention_mask = return_attention_mask
- self.do_normalize = do_normalize
-
- @staticmethod
- def zero_mean_unit_var_norm(
- input_values: list[np.ndarray], attention_mask: list[np.ndarray], padding_value: float = 0.0
- ) -> list[np.ndarray]:
- """
- Every array in the list is normalized to have zero mean and unit variance
- """
- if attention_mask is not None:
- attention_mask = np.array(attention_mask, np.int32)
- normed_input_values = []
-
- for vector, length in zip(input_values, attention_mask.sum(-1)):
- normed_slice = (vector - vector[:length].mean()) / np.sqrt(vector[:length].var() + 1e-7)
- if length < normed_slice.shape[0]:
- normed_slice[length:] = padding_value
-
- normed_input_values.append(normed_slice)
- else:
- normed_input_values = [(x - x.mean()) / np.sqrt(x.var() + 1e-7) for x in input_values]
-
- return normed_input_values
-
- def __call__(
- self,
- raw_speech: np.ndarray | list[float] | list[np.ndarray] | list[list[float]],
- padding: bool | str | PaddingStrategy = False,
- max_length: int | None = None,
- truncation: bool = False,
- pad_to_multiple_of: int | None = None,
- return_attention_mask: bool | None = None,
- return_tensors: str | TensorType | None = None,
- sampling_rate: int | None = None,
- **kwargs,
- ) -> BatchFeature:
- """
- Main method to featurize and prepare for the model one or several sequence(s).
-
- Args:
- raw_speech (`np.ndarray`, `list[float]`, `list[np.ndarray]`, `list[list[float]]`):
- The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
- values, a list of numpy arrays or a list of list of float values. Must be mono channel audio, not
- stereo, i.e. single float per timestep.
- padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
- Select a strategy to pad the returned sequences (according to the model's padding side and padding
- index) among:
-
- - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
- sequence if provided).
- - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
- acceptable input length for the model if that argument is not provided.
- - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
- lengths).
- max_length (`int`, *optional*):
- Maximum length of the returned list and optionally padding length (see above).
- truncation (`bool`):
- Activates truncation to cut input sequences longer than *max_length* to *max_length*.
- pad_to_multiple_of (`int`, *optional*):
- If set will pad the sequence to a multiple of the provided value.
-
- This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
- `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
- return_attention_mask (`bool`, *optional*):
- Whether to return the attention mask. If left to the default, will return the attention mask according
- to the specific feature_extractor's default.
-
- [What are attention masks?](../glossary#attention-mask)
-
-
-
- Wav2Vec2 models that have set `config.feat_extract_norm == "group"`, such as
- [wav2vec2-base](https://huggingface.co/facebook/wav2vec2-base-960h), have **not** been trained using
- `attention_mask`. For such models, `input_values` should simply be padded with 0 and no
- `attention_mask` should be passed.
-
- For Wav2Vec2 models that have set `config.feat_extract_norm == "layer"`, such as
- [wav2vec2-lv60](https://huggingface.co/facebook/wav2vec2-large-960h-lv60-self), `attention_mask` should
- be passed for batched inference.
-
-
-
- return_tensors (`str` or [`~utils.TensorType`], *optional*):
- If set, will return tensors instead of list of python integers. Acceptable values are:
-
- - `'pt'`: Return PyTorch `torch.Tensor` objects.
- - `'np'`: Return Numpy `np.ndarray` objects.
- sampling_rate (`int`, *optional*):
- The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass
- `sampling_rate` at the forward call to prevent silent errors.
- padding_value (`float`, *optional*, defaults to 0.0):
- """
-
- if sampling_rate is not None:
- if sampling_rate != self.sampling_rate:
- raise ValueError(
- f"The model corresponding to this feature extractor: {self} was trained using a sampling rate of"
- f" {self.sampling_rate}. Please make sure that the provided `raw_speech` input was sampled with"
- f" {self.sampling_rate} and not {sampling_rate}."
- )
- else:
- logger.warning(
- f"It is strongly recommended to pass the `sampling_rate` argument to `{self.__class__.__name__}()`. "
- "Failing to do so can result in silent errors that might be hard to debug."
- )
-
- is_batched_numpy = isinstance(raw_speech, np.ndarray) and len(raw_speech.shape) > 1
- if is_batched_numpy and len(raw_speech.shape) > 2:
- raise ValueError(f"Only mono-channel audio is supported for input to {self}")
- is_batched = is_batched_numpy or (
- isinstance(raw_speech, (list, tuple)) and (isinstance(raw_speech[0], (np.ndarray, tuple, list)))
- )
-
- # always return batch
- if not is_batched:
- raw_speech = [raw_speech]
-
- # convert into correct format for padding
- encoded_inputs = BatchFeature({"input_values": raw_speech})
-
- padded_inputs = self.pad(
- encoded_inputs,
- padding=padding,
- max_length=max_length,
- truncation=truncation,
- pad_to_multiple_of=pad_to_multiple_of,
- return_attention_mask=return_attention_mask,
- )
-
- # convert input values to correct format
- input_values = padded_inputs["input_values"]
- if not isinstance(input_values[0], np.ndarray):
- padded_inputs["input_values"] = [np.asarray(array, dtype=np.float32) for array in input_values]
- elif (
- not isinstance(input_values, np.ndarray)
- and isinstance(input_values[0], np.ndarray)
- and input_values[0].dtype is np.dtype(np.float64)
- ):
- padded_inputs["input_values"] = [array.astype(np.float32) for array in input_values]
- elif isinstance(input_values, np.ndarray) and input_values.dtype is np.dtype(np.float64):
- padded_inputs["input_values"] = input_values.astype(np.float32)
-
- # convert attention_mask to correct format
- attention_mask = padded_inputs.get("attention_mask")
- if attention_mask is not None:
- padded_inputs["attention_mask"] = [np.asarray(array, dtype=np.int32) for array in attention_mask]
-
- # zero-mean and unit-variance normalization
- if self.do_normalize:
- attention_mask = (
- attention_mask
- if self._get_padding_strategies(padding, max_length=max_length) is not PaddingStrategy.DO_NOT_PAD
- else None
- )
- padded_inputs["input_values"] = self.zero_mean_unit_var_norm(
- padded_inputs["input_values"], attention_mask=attention_mask, padding_value=self.padding_value
- )
-
- if return_tensors is not None:
- padded_inputs = padded_inputs.convert_to_tensors(return_tensors)
-
- return padded_inputs
+Wav2Vec2FeatureExtractor = deprecated_feature_extractor(Wav2Vec2AudioProcessor, "Wav2Vec2FeatureExtractor")
__all__ = ["Wav2Vec2FeatureExtractor"]
diff --git a/src/transformers/models/whisper/audio_processing_whisper.py b/src/transformers/models/whisper/audio_processing_whisper.py
new file mode 100644
index 000000000000..0a7f5bffa9be
--- /dev/null
+++ b/src/transformers/models/whisper/audio_processing_whisper.py
@@ -0,0 +1,57 @@
+# Copyright 2025 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+
+from ...audio_processing_backends import TorchAudioBackend
+from ...audio_utils import MelScaleConfig, SpectrogramConfig, StftConfig
+
+
+class WhisperAudioProcessor(TorchAudioBackend):
+ sample_rate = 16000
+ force_mono = True
+ return_padding_mask = False
+ truncation = True
+ max_length = 480000 # 30 seconds at 16000 Hz
+ spectrogram_config = SpectrogramConfig(
+ stft_config=StftConfig(
+ n_fft=400,
+ hop_length=160,
+ power=2.0,
+ ),
+ mel_scale_config=MelScaleConfig(
+ n_mels=80,
+ mel_scale="slaney",
+ norm="slaney",
+ computation_dtype="float64",
+ ),
+ log_mode="log10",
+ skip_last_frame=True,
+ )
+
+ def _apply_mel_scale(self, features, *, spectrogram_config, **kwargs):
+ mel_filters = self.mel_filters.to(device=features.device)
+ return torch.clamp(torch.matmul(mel_filters.T, features), min=spectrogram_config.mel_floor)
+
+ def _normalize_magnitude(self, features, *, spectrogram_config, **kwargs):
+ features = super()._normalize_magnitude(features, spectrogram_config=spectrogram_config, **kwargs)
+
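+        # Whisper's log-mel post-processing: clamp each example to within 8.0 of its maximum
+        # log10 value, then rescale with (x + 4.0) / 4.0.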
+ max_vals = features.amax(dim=(-2, -1), keepdim=True)
+ features = torch.maximum(features, max_vals - 8.0)
+ features = (features + 4.0) / 4.0
+
+ return features
+
+
+__all__ = ["WhisperAudioProcessor"]
diff --git a/src/transformers/models/whisper/feature_extraction_whisper.py b/src/transformers/models/whisper/feature_extraction_whisper.py
index 4151a3824dfd..4e4f49df3c2d 100644
--- a/src/transformers/models/whisper/feature_extraction_whisper.py
+++ b/src/transformers/models/whisper/feature_extraction_whisper.py
@@ -11,335 +11,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-"""
-Feature extractor class for Whisper
-"""
+from ...utils.deprecation import deprecated_feature_extractor
+from .audio_processing_whisper import WhisperAudioProcessor
-import numpy as np
-
-from ... import is_torch_available
-from ...audio_utils import mel_filter_bank, spectrogram, window_function
-from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
-from ...feature_extraction_utils import BatchFeature
-from ...utils import TensorType, logging
-
-
-if is_torch_available():
- import torch
-
-logger = logging.get_logger(__name__)
-
-
-class WhisperFeatureExtractor(SequenceFeatureExtractor):
- r"""
- Constructs a Whisper feature extractor.
-
- This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
- most of the main methods. Users should refer to this superclass for more information regarding those methods.
-
- This class extracts mel-filter bank features from raw speech using a custom numpy implementation of the `Short Time
- Fourier Transform` which should match pytorch's `torch.stft` equivalent.
-
- Args:
- feature_size (`int`, *optional*, defaults to 80):
- The feature dimension of the extracted features.
- sampling_rate (`int`, *optional*, defaults to 16000):
- The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
- hop_length (`int`, *optional*, defaults to 160):
- Length of the overlapping windows for the STFT used to obtain the Mel Frequency coefficients.
- chunk_length (`int`, *optional*, defaults to 30):
- The maximum number of chunks of `sampling_rate` samples used to trim and pad longer or shorter audio
- sequences.
- n_fft (`int`, *optional*, defaults to 400):
- Size of the Fourier transform.
- padding_value (`float`, *optional*, defaults to 0.0):
- Padding value used to pad the audio. Should correspond to silences.
- dither (`float`, *optional*, defaults to 0.0):
- Adds dithering. In other words, adds a small Gaussian noise to each frame.
- E.g. use 0.0001 to add dithering with a normal distribution centered
- around 0.0 with standard deviation 0.0001 (assuming [-1,+1] range of raw_speech).
- The value 0.0 means no dithering.
- Dithering has similar effect as `spectrogram(mel_floor=...)`. It reduces
- the high log_mel_fbank values for signals with hard-zero sections,
- when VAD cutoff is present in the signal.
- """
-
- model_input_names = ["input_features"]
-
- def __init__(
- self,
- feature_size=80,
- sampling_rate=16000,
- hop_length=160,
- chunk_length=30,
- n_fft=400,
- padding_value=0.0,
- dither=0.0,
- return_attention_mask=False, # pad inputs to max length with silence token (zero) and no attention mask
- **kwargs,
- ):
- super().__init__(
- feature_size=feature_size,
- sampling_rate=sampling_rate,
- padding_value=padding_value,
- return_attention_mask=return_attention_mask,
- **kwargs,
- )
- self.n_fft = n_fft
- self.hop_length = hop_length
- self.chunk_length = chunk_length
- self.n_samples = chunk_length * sampling_rate
- self.nb_max_frames = self.n_samples // hop_length
- self.sampling_rate = sampling_rate
- self.dither = dither
- self.mel_filters = mel_filter_bank(
- num_frequency_bins=1 + n_fft // 2,
- num_mel_filters=feature_size,
- min_frequency=0.0,
- max_frequency=8000.0,
- sampling_rate=sampling_rate,
- norm="slaney",
- mel_scale="slaney",
- )
-
- def _np_extract_fbank_features(self, waveform_batch: np.ndarray, device: str) -> np.ndarray:
- """
- Compute the log-mel spectrogram of the provided audio, gives similar results to Whisper's original torch
- implementation with 1e-5 tolerance.
- """
- if device != "cpu":
- raise ValueError(
- f"Got device `{device}` for feature extraction, but feature extraction on CUDA accelerator "
- "devices requires torch, which is not installed. Either set `device='cpu'`, or "
- "install torch according to the official instructions: https://pytorch.org/get-started/locally/"
- )
- log_spec_batch = []
- for waveform in waveform_batch:
- log_spec = spectrogram(
- waveform,
- window_function(self.n_fft, "hann"),
- frame_length=self.n_fft,
- hop_length=self.hop_length,
- power=2.0,
- dither=self.dither,
- mel_filters=self.mel_filters,
- log_mel="log10",
- )
- log_spec = log_spec[:, :-1]
- log_spec = np.maximum(log_spec, log_spec.max() - 8.0)
- log_spec = (log_spec + 4.0) / 4.0
- log_spec_batch.append(log_spec)
- log_spec_batch = np.array(log_spec_batch)
- return log_spec_batch
-
- def _torch_extract_fbank_features(self, waveform: np.ndarray, device: str = "cpu") -> np.ndarray:
- """
- Compute the log-mel spectrogram of the audio using PyTorch's GPU-accelerated STFT implementation with batching,
- yielding results similar to cpu computing with 1e-5 tolerance.
- """
- waveform = torch.from_numpy(waveform).to(device, torch.float32)
- window = torch.hann_window(self.n_fft, device=device)
-
- # Note: it would be better to dither the chunked waveform,
- # so overlapping signal does not get the same dithering.
- # But, chunking is happening inside pytorch, so it is here.
- if self.dither != 0.0:
- waveform += self.dither * torch.randn(waveform.shape, dtype=waveform.dtype, device=waveform.device)
-
- stft = torch.stft(waveform, self.n_fft, self.hop_length, window=window, return_complex=True)
- magnitudes = stft[..., :-1].abs() ** 2
-
- mel_filters = torch.from_numpy(self.mel_filters).to(device, torch.float32)
- mel_spec = mel_filters.T @ magnitudes
-
- log_spec = torch.clamp(mel_spec, min=1e-10).log10()
- if waveform.dim() == 2:
- max_val = log_spec.max(dim=2, keepdim=True)[0].max(dim=1, keepdim=True)[0]
- log_spec = torch.maximum(log_spec, max_val - 8.0)
- else:
- log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
- log_spec = (log_spec + 4.0) / 4.0
- if device != "cpu":
- log_spec = log_spec.detach().cpu()
- return log_spec.numpy()
-
- @staticmethod
- # Copied from transformers.models.wav2vec2.feature_extraction_wav2vec2.Wav2Vec2FeatureExtractor.zero_mean_unit_var_norm
- def zero_mean_unit_var_norm(
- input_values: list[np.ndarray], attention_mask: list[np.ndarray], padding_value: float = 0.0
- ) -> list[np.ndarray]:
- """
- Every array in the list is normalized to have zero mean and unit variance
- """
- if attention_mask is not None:
- attention_mask = np.array(attention_mask, np.int32)
- normed_input_values = []
-
- for vector, length in zip(input_values, attention_mask.sum(-1)):
- normed_slice = (vector - vector[:length].mean()) / np.sqrt(vector[:length].var() + 1e-7)
- if length < normed_slice.shape[0]:
- normed_slice[length:] = padding_value
-
- normed_input_values.append(normed_slice)
- else:
- normed_input_values = [(x - x.mean()) / np.sqrt(x.var() + 1e-7) for x in input_values]
-
- return normed_input_values
-
- def __call__(
- self,
- raw_speech: np.ndarray | list[float] | list[np.ndarray] | list[list[float]],
- truncation: bool = True,
- pad_to_multiple_of: int | None = None,
- return_tensors: str | TensorType | None = None,
- return_attention_mask: bool | None = None,
- padding: str | None = "max_length",
- max_length: int | None = None,
- sampling_rate: int | None = None,
- do_normalize: bool | None = None,
- device: str | None = "cpu",
- **kwargs,
- ) -> BatchFeature:
- """Main method to featurize and prepare for the model one or several sequence(s). Implementation uses PyTorch
- for the STFT computation if available, otherwise a slower NumPy based one.
-
- Args:
- raw_speech (`np.ndarray`, `list[float]`, `list[np.ndarray]`, `list[list[float]]`):
- The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
- values, a list of numpy arrays or a list of list of float values. Must be mono channel audio, not
- stereo, i.e. single float per timestep.
- truncation (`bool`, *optional*, default to `True`):
- Activates truncation to cut input sequences longer than *max_length* to *max_length*.
- pad_to_multiple_of (`int`, *optional*, defaults to None):
- If set will pad the sequence to a multiple of the provided value.
-
- This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
- `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
- return_tensors (`str` or [`~utils.TensorType`], *optional*):
- If set, will return tensors instead of list of python integers. Acceptable values are:
-
- - `'pt'`: Return PyTorch `torch.Tensor` objects.
- - `'np'`: Return Numpy `np.ndarray` objects.
- return_attention_mask (`bool`, *optional*):
- Whether to return the attention mask. If left to the default, will return the attention mask according
- to the specific feature_extractor's default.
-
- [What are attention masks?](../glossary#attention-mask)
-
-
-
- For Whisper models, `attention_mask` should always be passed for batched inference, to avoid subtle
- bugs.
-
-
- padding (`str` or [`~utils.PaddingStrategy`], *optional*, defaults to `'max_length'`):
- Activates and controls padding. Accepts the following values:
-
- - `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single sequence is
- provided).
- - `'max_length'` (default): Pad to a maximum length specified with the argument `max_length` or to the
- maximum acceptable input length for the model if that argument is not provided.
- - `'do_not_pad'`: No padding (i.e., can output a batch with sequences of different lengths).
- max_length (`int`, *optional*):
- Controls the maximum length to use by one of the truncation/padding parameters.
-
- If left unset or set to `None`, this will use the predefined model maximum length if a maximum length
- is required by one of the truncation/padding parameters. If the model has no specific maximum input
- length (like XLNet) truncation/padding to a maximum length will be deactivated.
- sampling_rate (`int`, *optional*):
- The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass
- `sampling_rate` at the forward call to prevent silent errors and allow automatic speech recognition
- pipeline.
- do_normalize (`bool`, *optional*, defaults to `False`):
- Whether or not to zero-mean unit-variance normalize the input. Normalizing can help to significantly
- improve the performance of the model.
- device (`str`, *optional*, defaults to `'cpu'`):
- Specifies the device for computation of the log-mel spectrogram of audio signals in the
- `_torch_extract_fbank_features` method. (e.g., "cpu", "cuda")
- **kwargs: Not supported by WhisperFeatureExtractor.__call__() and ignored.
- """
- if sampling_rate is not None:
- if sampling_rate != self.sampling_rate:
- raise ValueError(
- f"The model corresponding to this feature extractor: {self.__class__.__name__} was trained using a"
- f" sampling rate of {self.sampling_rate}. Please make sure that the provided `raw_speech` input"
- f" was sampled with {self.sampling_rate} and not {sampling_rate}."
- )
- else:
- logger.warning(
- f"It is strongly recommended to pass the `sampling_rate` argument to `{self.__class__.__name__}()`. "
- "Failing to do so can result in silent errors that might be hard to debug."
- )
-
- is_batched_numpy = isinstance(raw_speech, np.ndarray) and len(raw_speech.shape) > 1
- if is_batched_numpy and len(raw_speech.shape) > 2:
- raise ValueError(f"Only mono-channel audio is supported for input to {self}")
- is_batched = is_batched_numpy or (
- isinstance(raw_speech, (list, tuple)) and (isinstance(raw_speech[0], (np.ndarray, tuple, list)))
- )
-
- if is_batched:
- raw_speech = [np.asarray([speech], dtype=np.float32).T for speech in raw_speech]
- elif not is_batched and not isinstance(raw_speech, np.ndarray):
- raw_speech = np.asarray(raw_speech, dtype=np.float32)
- elif isinstance(raw_speech, np.ndarray) and raw_speech.dtype is np.dtype(np.float64):
- raw_speech = raw_speech.astype(np.float32)
-
- # always return batch
- if not is_batched:
- raw_speech = [np.asarray([raw_speech]).T]
-
- batched_speech = BatchFeature({"input_features": raw_speech})
-
- # convert into correct format for padding
-
- padded_inputs = self.pad(
- batched_speech,
- padding=padding,
- max_length=max_length if max_length else self.n_samples,
- truncation=truncation,
- pad_to_multiple_of=pad_to_multiple_of,
- return_attention_mask=return_attention_mask or do_normalize,
- )
-
- # zero-mean and unit-variance normalization
- if do_normalize:
- padded_inputs["input_features"] = self.zero_mean_unit_var_norm(
- padded_inputs["input_features"],
- attention_mask=padded_inputs["attention_mask"],
- padding_value=self.padding_value,
- )
- padded_inputs["input_features"] = np.stack(padded_inputs["input_features"], axis=0)
-
- # make sure list is in array format
- input_features = padded_inputs.get("input_features").transpose(2, 0, 1)
-
- extract_fbank_features = (
- self._torch_extract_fbank_features if is_torch_available() else self._np_extract_fbank_features
- )
- input_features = extract_fbank_features(input_features[0], device)
-
- if isinstance(input_features[0], list):
- padded_inputs["input_features"] = [np.asarray(feature, dtype=np.float32) for feature in input_features]
-
- else:
- padded_inputs["input_features"] = input_features
-
- if return_attention_mask:
- # rescale from sample (48000) to feature (3000)
- rescaled_attention_mask = padded_inputs["attention_mask"][:, :: self.hop_length]
-
- # The STFT computation produces L//hop_length + 1 frames, but we skip the last frame (see `_torch_extract_fbank_features`).
- # This means we need to trim the rescaled attention mask to match the actual number of frames (L//hop_length) when the input length
- # is not perfectly divisible by the hop length.
- if padded_inputs["attention_mask"].shape[1] % self.hop_length != 0:
- rescaled_attention_mask = rescaled_attention_mask[:, :-1]
- padded_inputs["attention_mask"] = rescaled_attention_mask
-
- if return_tensors is not None:
- padded_inputs = padded_inputs.convert_to_tensors(return_tensors)
-
- return padded_inputs
+WhisperFeatureExtractor = deprecated_feature_extractor(WhisperAudioProcessor, "WhisperFeatureExtractor")
__all__ = ["WhisperFeatureExtractor"]
diff --git a/src/transformers/numpy_mel_spectrogram.py b/src/transformers/numpy_mel_spectrogram.py
new file mode 100644
index 000000000000..cd90215b78a1
--- /dev/null
+++ b/src/transformers/numpy_mel_spectrogram.py
@@ -0,0 +1,413 @@
+"""NumPy implementation of mel spectrogram computation."""
+
+import numpy as np
+import librosa
+
+
+# --- Frequency conversion utilities ---
+
+def hertz_to_mel(freq, mel_scale="htk"):
+ if mel_scale == "htk":
+ return 2595.0 * np.log10(1.0 + (freq / 700.0))
+ elif mel_scale == "kaldi":
+ return 1127.0 * np.log(1.0 + (freq / 700.0))
+ # slaney
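+    # (linear below 1 kHz at 200 / 3 Hz per mel, logarithmic above with 27 mels per factor
+    # of 6.4 in frequency)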
+ min_log_hertz = 1000.0
+ min_log_mel = 15.0
+ logstep = 27.0 / np.log(6.4)
+ mels = 3.0 * freq / 200.0
+ if isinstance(freq, np.ndarray):
+ log_region = freq >= min_log_hertz
+ mels[log_region] = min_log_mel + np.log(freq[log_region] / min_log_hertz) * logstep
+ elif freq >= min_log_hertz:
+ mels = min_log_mel + np.log(freq / min_log_hertz) * logstep
+ return mels
+
+
+def mel_to_hertz(mels, mel_scale="htk"):
+ if mel_scale == "htk":
+ return 700.0 * (np.power(10, mels / 2595.0) - 1.0)
+ elif mel_scale == "kaldi":
+ return 700.0 * (np.exp(mels / 1127.0) - 1.0)
+ # slaney
+ min_log_hertz = 1000.0
+ min_log_mel = 15.0
+ logstep = np.log(6.4) / 27.0
+ freq = 200.0 * mels / 3.0
+ if isinstance(mels, np.ndarray):
+ log_region = mels >= min_log_mel
+ freq[log_region] = min_log_hertz * np.exp(logstep * (mels[log_region] - min_log_mel))
+ elif mels >= min_log_mel:
+ freq = min_log_hertz * np.exp(logstep * (mels - min_log_mel))
+ return freq
+
+
+# --- Filter bank ---
+
+def _create_triangular_filter_bank(fft_freqs, filter_freqs):
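+    """Build triangular filters: filter ``i`` rises from ``filter_freqs[i]``, peaks at
+    ``filter_freqs[i + 1]`` and falls to zero at ``filter_freqs[i + 2]``. Returns an array of
+    shape ``(len(fft_freqs), len(filter_freqs) - 2)``."""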
+ filter_diff = np.diff(filter_freqs)
+ slopes = np.expand_dims(filter_freqs, 0) - np.expand_dims(fft_freqs, 1)
+ down_slopes = -slopes[:, :-2] / filter_diff[:-1]
+ up_slopes = slopes[:, 2:] / filter_diff[1:]
+ return np.maximum(0, np.minimum(down_slopes, up_slopes))
+
+
+def mel_filter_bank(
+ num_frequency_bins,
+ num_mel_filters,
+ min_frequency,
+ max_frequency,
+ sampling_rate,
+ norm=None,
+ mel_scale="htk",
+ triangularize_in_mel_space=False,
+ frequency_bin_mode="rfft",
+):
+ mel_min = hertz_to_mel(min_frequency, mel_scale=mel_scale)
+ mel_max = hertz_to_mel(max_frequency, mel_scale=mel_scale)
+ mel_freqs = np.linspace(mel_min, mel_max, num_mel_filters + 2)
+ filter_freqs = mel_to_hertz(mel_freqs, mel_scale=mel_scale)
+
+ n_fft = (num_frequency_bins - 1) * 2
+
+ if triangularize_in_mel_space:
+ fft_bin_width = sampling_rate / n_fft
+ fft_freqs = hertz_to_mel(
+ fft_bin_width * np.arange(num_frequency_bins), mel_scale=mel_scale
+ )
+ filter_freqs = mel_freqs
+ elif frequency_bin_mode == "rfft":
+ fft_freqs = np.fft.rfftfreq(n=n_fft, d=1.0 / sampling_rate)
+ else:
+ fft_freqs = np.linspace(0, sampling_rate // 2, num_frequency_bins)
+
+ mel_filters = _create_triangular_filter_bank(fft_freqs, filter_freqs)
+
+ if norm == "slaney":
+ enorm = 2.0 / (
+ filter_freqs[2 : num_mel_filters + 2] - filter_freqs[:num_mel_filters]
+ )
+ mel_filters *= np.expand_dims(enorm, 0)
+
+ return mel_filters
+
+
+# --- Window ---
+
+def window_function(window_length, name="hann_window", periodic=True):
+ N = window_length + 1 if periodic else window_length
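+    # A periodic window is a symmetric window of length ``window_length + 1`` with the last
+    # sample dropped, matching e.g. ``torch.hann_window(periodic=True)`` for STFT analysis.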
+ fac = np.linspace(-np.pi, np.pi, N)
+ if name in ("hann", "hann_window"):
+ w = 0.5 + 0.5 * np.cos(fac)
+ elif name in ("hamming", "hamming_window"):
+ w = 0.54 + 0.46 * np.cos(fac)
+ elif name == "boxcar":
+ w = np.ones(N)
+ elif name == "povey":
+ w = (0.5 + 0.5 * np.cos(fac)) ** 0.85
+ else:
+ raise ValueError(f"Unknown window function '{name}'")
+ return w[:window_length] if periodic else w
+
+
+# --- Sub-methods ---
+
+def _prepare_window_and_framing(window, win_length, n_fft, needs_manual_framing):
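+    # Two framing modes: with Kaldi-style per-frame processing (dither / preemphasis / DC
+    # removal) and win_length < n_fft, frames keep win_length samples and the FFT zero-pads
+    # to n_fft; otherwise the window is center-padded to n_fft when it is shorter and frames
+    # span n_fft samples, as torch.stft does.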
+ if needs_manual_framing and win_length < n_fft:
+ frame_length = win_length
+ else:
+ if win_length < n_fft:
+ left_pad = (n_fft - win_length) // 2
+ right_pad = n_fft - win_length - left_pad
+ window = np.pad(window, (left_pad, right_pad))
+ frame_length = n_fft
+ return window, frame_length
+
+
+def _frame_waveform(waveform, frame_length, hop_length, n_fft, center, pad_mode):
+ squeezed = waveform.ndim == 1
+ if squeezed:
+ waveform = waveform[np.newaxis, :]
+ if center:
+ # Use librosa-compatible split-padding to match their STFT exactly
+ # This replicates librosa's optimization to avoid copying the entire signal
+ start_k = int(np.ceil(n_fft // 2 / hop_length))
+ tail_k = (waveform.shape[-1] + n_fft // 2 - n_fft) // hop_length + 1
+
+ if tail_k <= start_k:
+ # Head and tail overlap, use simple full padding
+ waveform = np.pad(waveform, ((0, 0), (frame_length // 2, frame_length // 2)), mode=pad_mode)
+ num_frames = 1 + (waveform.shape[-1] - frame_length) // hop_length
+ frame_starts = np.arange(num_frames) * hop_length
+ frame_indices = frame_starts[:, np.newaxis] + np.arange(frame_length)
+ frames = waveform[:, frame_indices] # (batch, num_frames, frame_length)
+ else:
+ # Split padding: handle head and tail separately like librosa
+ # Pre-padding: left pad only
+ padding = [(0, 0) for _ in range(waveform.ndim)]
+ padding[-1] = (frame_length // 2, 0)
+ y_pre = np.pad(
+ waveform[..., : (start_k - 1) * hop_length - n_fft // 2 + n_fft + 1],
+ padding,
+ mode=pad_mode,
+ )
+ y_frames_pre = librosa.util.frame(y_pre, frame_length=frame_length, hop_length=hop_length)
+ y_frames_pre = y_frames_pre[..., :start_k]
+ y_frames_pre = np.moveaxis(y_frames_pre, -2, -1) # (batch, frame_length, num_frames) -> (batch, num_frames, frame_length)
+
+ # Post-padding: right pad only
+ padding[-1] = (0, frame_length // 2)
+ y_post = np.pad(
+ waveform[..., (tail_k) * hop_length - n_fft // 2 :],
+ padding,
+ mode=pad_mode,
+ )
+ y_frames_post = librosa.util.frame(y_post, frame_length=frame_length, hop_length=hop_length)
+ y_frames_post = np.moveaxis(y_frames_post, -2, -1) # (batch, frame_length, num_frames) -> (batch, num_frames, frame_length)
+
+ # Middle: no padding
+ start = start_k * hop_length - n_fft // 2
+ y_frames_middle = librosa.util.frame(
+ waveform[..., start:], frame_length=frame_length, hop_length=hop_length
+ )
+ y_frames_middle = np.moveaxis(y_frames_middle, -2, -1) # (batch, frame_length, num_frames) -> (batch, num_frames, frame_length)
+
+ # Total frames
+ num_frames = y_frames_pre.shape[-2] + y_frames_middle.shape[-2] + y_frames_post.shape[-2]
+
+ # Concatenate frames
+ frames = np.concatenate([y_frames_pre, y_frames_middle, y_frames_post], axis=-2)
+ else:
+ # No centering: no padding
+ num_frames = 1 + (waveform.shape[-1] - frame_length) // hop_length
+ frame_starts = np.arange(num_frames) * hop_length
+ frame_indices = frame_starts[:, np.newaxis] + np.arange(frame_length)
+ frames = waveform[:, frame_indices] # (batch, num_frames, frame_length)
+
+ if squeezed:
+ frames = frames.squeeze(0)
+ return frames, num_frames
+
+
+def _apply_frame_processing(frames, *, dither=0.0, preemphasis=None, remove_dc_offset=False):
+ compute_dtype = frames.dtype
+ if dither != 0.0:
+ frames = frames + dither * np.random.randn(*frames.shape).astype(compute_dtype)
+ if remove_dc_offset:
+ frames = frames - frames.mean(axis=-1, keepdims=True)
+ if preemphasis is not None:
+ preemph_src = preemphasis * frames[..., :-1]
+ frames[..., 1:] = frames[..., 1:] - preemph_src
+ frames[..., 0] = frames[..., 0] * (1 - preemphasis)
+ return frames
+
+
+def _windowed_fft(frames, window, fft_length, power, normalized):
+ """Apply window, compute FFT, and return power spectrogram of shape (..., freq, time)."""
+ frames = frames * window
+ spec = np.fft.rfft(frames, n=fft_length, axis=-1).astype(np.complex64)
+ if normalized:
+ spec = spec / np.sqrt(np.sum(window**2)).astype(spec.real.dtype)
+ spec = np.abs(spec, dtype=np.float64) ** power
+ return np.moveaxis(spec, -1, -2)
+
+
+def _apply_mel_scale(
+ spectrogram: np.ndarray,
+ mel_filters: np.ndarray,
+ mel_floor: float = 1e-10,
+) -> np.ndarray:
+ """Apply mel filterbank to a spectrogram.
+
+ Args:
+ spectrogram: Power spectrogram of shape (..., freq, time).
+ mel_filters: Mel filterbank of shape (freq, n_mels).
+ mel_floor: Minimum value for clamping.
+
+ Returns:
+ Mel spectrogram of shape (..., n_mels, time).
+ """
+ # (n_mels, freq) @ (..., freq, time) -> (..., n_mels, time)
+ mel_spec = np.matmul(mel_filters.T, spectrogram)
+ return np.maximum(mel_floor, mel_spec)
+
+
+# --- Main function ---
+
+def mel_spectrogram(
+ waveform: np.ndarray,
+ sampling_rate: int,
+ *,
+ n_fft: int = 400,
+ win_length: int | None = None,
+ hop_length: int | None = None,
+ window_fn: str = "hann_window",
+ wkwargs: dict | None = None,
+ power: float = 2.0,
+ center: bool = True,
+ pad_mode: str = "reflect",
+ normalized: bool = False,
+ periodic: bool = True,
+ # mel scale kwargs
+ n_mels: int = 128,
+ f_min: float = 0.0,
+ f_max: float | None = None,
+ mel_scale: str = "htk",
+ norm: str | None = None,
+ triangularize_in_mel_space: bool = False,
+ # kaldi-specific kwargs
+ dither: float = 0.0,
+ preemphasis: float | None = None,
+ remove_dc_offset: bool = False,
+ mel_floor: float = 1e-10,
+) -> np.ndarray:
+ """Compute mel spectrogram using NumPy.
+
+ Args:
+ waveform: Input waveform of shape (..., time).
+ sampling_rate: Sample rate in Hz.
+
+ Returns:
+ Mel spectrogram of shape (..., n_mels, time).
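+
+    Example (an illustrative sketch; the parameter values are assumptions chosen for
+    demonstration, not the defaults of any particular model)::
+
+        waveform = np.random.randn(16000).astype(np.float32)
+        mel = mel_spectrogram(waveform, sampling_rate=16000, n_fft=400, hop_length=160,
+                              n_mels=80, mel_scale="slaney", norm="slaney")
+        # mel has shape (n_mels, num_frames) and is floored at `mel_floor`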
+ """
+ if f_max is None:
+ f_max = sampling_rate / 2.0
+
+ # --- STFT ---
+ if win_length is None:
+ win_length = n_fft
+ if hop_length is None:
+ hop_length = win_length // 2
+ window = window_function(win_length, name=window_fn, periodic=periodic)
+
+ needs_manual_framing = (dither != 0.0) or (preemphasis is not None) or remove_dc_offset
+ window, frame_length = _prepare_window_and_framing(window, win_length, n_fft, needs_manual_framing)
+
+ is_1d = waveform.ndim == 1
+ if is_1d:
+ waveform = waveform[np.newaxis, :]
+ leading_shape = waveform.shape[:-1]
+ waveform = waveform.reshape(-1, waveform.shape[-1])
+ frames, num_frames = _frame_waveform(waveform, frame_length, hop_length, n_fft, center, pad_mode)
+ compute_dtype = np.result_type(waveform.dtype, window.dtype)
+ frames = frames.astype(compute_dtype, copy=False)
+ frames = _apply_frame_processing(frames, dither=dither, preemphasis=preemphasis, remove_dc_offset=remove_dc_offset)
+ spectrogram = _windowed_fft(frames, window, n_fft, power, normalized)
+
+ num_frequency_bins = n_fft // 2 + 1
+ spectrogram = spectrogram.reshape(*leading_shape, num_frequency_bins, num_frames)
+ if is_1d:
+ spectrogram = spectrogram.squeeze(0)
+
+ num_frequency_bins = spectrogram.shape[-2]
+ mel_fb = mel_filter_bank(
+ num_frequency_bins, n_mels, f_min, f_max, sampling_rate,
+ norm=norm, mel_scale=mel_scale,
+ triangularize_in_mel_space=triangularize_in_mel_space,
+ )
+
+ return _apply_mel_scale(spectrogram, mel_fb, mel_floor=mel_floor)
+
+
+class MelSpectrogram:
+ """Cached mel spectrogram — precomputes window and mel filterbank.
+
+ Same API and exact same results as the functional ``mel_spectrogram``, but
+ avoids recomputing the window and mel filterbank on every call.
+
+ Usage::
+
+ transform = MelSpectrogram(sampling_rate=16000, n_fft=1024, n_mels=80)
+ mel = transform(waveform) # fast repeated calls
+ """
+
+ def __init__(
+ self,
+ sampling_rate: int,
+ *,
+ n_fft: int = 400,
+ win_length: int | None = None,
+ hop_length: int | None = None,
+ window_fn: str = "hann_window",
+ wkwargs: dict | None = None,
+ power: float = 2.0,
+ center: bool = True,
+ pad_mode: str = "reflect",
+ normalized: bool = False,
+ periodic: bool = True,
+ n_mels: int = 128,
+ f_min: float = 0.0,
+ f_max: float | None = None,
+ mel_scale: str = "htk",
+ norm: str | None = None,
+ triangularize_in_mel_space: bool = False,
+ dither: float = 0.0,
+ preemphasis: float | None = None,
+ remove_dc_offset: bool = False,
+ mel_floor: float = 1e-10,
+ ):
+ self.sampling_rate = sampling_rate
+ self.n_fft = n_fft
+ self.win_length = win_length if win_length is not None else n_fft
+ self.hop_length = hop_length if hop_length is not None else self.win_length // 2
+ self.power = power
+ self.center = center
+ self.pad_mode = pad_mode
+ self.normalized = normalized
+ self.periodic = periodic
+ self.n_mels = n_mels
+ self.f_min = f_min
+ self.f_max = f_max if f_max is not None else sampling_rate / 2.0
+ self.mel_floor = mel_floor
+ self.dither = dither
+ self.preemphasis = preemphasis
+ self.remove_dc_offset = remove_dc_offset
+ self.window_fn = window_fn
+
+ # Precompute window
+ needs_manual_framing = (dither != 0.0) or (preemphasis is not None) or remove_dc_offset
+ window = window_function(self.win_length, name=window_fn, periodic=periodic)
+ self._window, self._frame_length = _prepare_window_and_framing(
+ window, self.win_length, n_fft, needs_manual_framing,
+ )
+
+ # Precompute mel filterbank
+ num_frequency_bins = n_fft // 2 + 1
+ self._mel_fb = mel_filter_bank(
+ num_frequency_bins, n_mels, self.f_min, self.f_max, sampling_rate,
+ norm=norm, mel_scale=mel_scale,
+ triangularize_in_mel_space=triangularize_in_mel_space,
+ )
+
+ def __call__(self, waveform: np.ndarray) -> np.ndarray:
+ """Compute mel spectrogram.
+
+ Args:
+ waveform: Input of shape (..., time).
+
+ Returns:
+ Mel spectrogram of shape (..., n_mels, time).
+ """
+ is_1d = waveform.ndim == 1
+ if is_1d:
+ waveform = waveform[np.newaxis, :]
+ leading_shape = waveform.shape[:-1]
+ waveform = waveform.reshape(-1, waveform.shape[-1])
+ frames, num_frames = _frame_waveform(
+ waveform, self._frame_length, self.hop_length, self.n_fft, self.center, self.pad_mode,
+ )
+ compute_dtype = np.result_type(waveform.dtype, self._window.dtype)
+ frames = frames.astype(compute_dtype, copy=False)
+ frames = _apply_frame_processing(
+ frames, dither=self.dither, preemphasis=self.preemphasis, remove_dc_offset=self.remove_dc_offset,
+ )
+ spectrogram = _windowed_fft(frames, self._window, self.n_fft, self.power, self.normalized)
+
+ num_frequency_bins = self.n_fft // 2 + 1
+ spectrogram = spectrogram.reshape(*leading_shape, num_frequency_bins, num_frames)
+ if is_1d:
+ spectrogram = spectrogram.squeeze(0)
+
+ return _apply_mel_scale(spectrogram, self._mel_fb, mel_floor=self.mel_floor)
diff --git a/src/transformers/preprocessing_base.py b/src/transformers/preprocessing_base.py
new file mode 100644
index 000000000000..d994f4811e32
--- /dev/null
+++ b/src/transformers/preprocessing_base.py
@@ -0,0 +1,470 @@
+# Copyright 2020 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Base mixin for image processors and feature extractors, providing shared
+save/load/serialization logic.
+"""
+
+import copy
+import json
+import os
+from copy import deepcopy
+from typing import Any, TypeVar
+
+import numpy as np
+from huggingface_hub import create_repo
+
+from .dynamic_module_utils import custom_object_save
+from .utils import (
+    PROCESSOR_NAME,
+    PushToHubMixin,
+    is_offline_mode,
+    logging,
+    safe_load_json_file,
+)
+from .utils.hub import cached_file
+
+
+logger = logging.get_logger(__name__)
+
+PreprocessingMixinType = TypeVar("PreprocessingMixinType", bound="PreprocessingMixin")
+
+
+class PreprocessingMixin(PushToHubMixin):
+ """
+ Base mixin providing saving/loading functionality shared by
+ ImageProcessingMixin, AudioProcessingMixin and FeatureExtractionMixin.
+
+ Subclasses must set the following class attributes:
+ _config_name: str — config file name (e.g. IMAGE_PROCESSOR_NAME)
+ _type_key: str — key added in to_dict() (e.g. "image_processor_type")
+ _nested_config_keys: list — keys to check in processor_config.json
+ _auto_class_default: str — default auto class for register_for_auto_class
+ _file_type_label: str — label for user-agent / error messages
+ Optional:
+ _excluded_dict_keys: set — keys to drop from to_dict() output
+ _extra_init_pops: list — extra keys to pop in __init__
+ _config_filename_kwarg: str — kwarg name that can override the config filename
+ _subfolder_default: str — default for the subfolder kwarg
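+
+    A hypothetical subclass might look like (values are illustrative)::
+
+        class MyAudioProcessor(PreprocessingMixin):
+            _config_name = "preprocessor_config.json"
+            _type_key = "audio_processor_type"
+            _nested_config_keys = ["audio_processor"]
+            _auto_class_default = "AutoFeatureExtractor"
+            _file_type_label = "audio processor"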
+ """
+
+ _auto_class = None
+
+ # --- Must be overridden by subclasses ---
+ _config_name: str
+ _type_key: str
+ _nested_config_keys: list[str] = []
+ _auto_class_default: str
+ _file_type_label: str
+
+ # --- Optional overrides ---
+ _excluded_dict_keys: set[str] = set()
+ _extra_init_pops: list[str] = []
+ _config_filename_kwarg: str | None = None
+ _subfolder_default: str | None = ""
+
+ def __init__(self, **kwargs):
+ """Set elements of `kwargs` as attributes."""
+ for key in self._extra_init_pops:
+ kwargs.pop(key, None)
+ # Pop "processor_class", should not be saved in config
+ kwargs.pop("processor_class", None)
+
+ if hasattr(self, "valid_kwargs") and hasattr(self.valid_kwargs, "__annotations__"):
+ self._init_kwargs_from_valid_kwargs(kwargs)
+
+ # Additional attributes without default values
+ for key, value in kwargs.items():
+ try:
+ setattr(self, key, value)
+ except AttributeError as err:
+ logger.error(f"Can't set {key} with value {value} for {self}")
+ raise err
+
+ def _init_kwargs_from_valid_kwargs(self, kwargs: dict):
+ """
+ Initialize instance attributes from `valid_kwargs` annotations.
+
+ For each key in `self.valid_kwargs.__annotations__`, pops it from `kwargs`
+ and sets it on the instance (or deep-copies the class default).
+ Also sets `self._valid_kwargs_names`.
+ """
+ for key in self.valid_kwargs.__annotations__:
+ kwarg = kwargs.pop(key, None)
+ if kwarg is not None:
+ setattr(self, key, kwarg)
+ else:
+ setattr(self, key, deepcopy(getattr(self, key, None)))
+ self._valid_kwargs_names = list(self.valid_kwargs.__annotations__.keys())
+
+ def filter_out_unused_kwargs(self, kwargs: dict) -> dict:
+ """
+ Filter out the unused kwargs from the kwargs dictionary.
+ """
+        # `unused_kwargs` is optional on subclasses; a missing attribute means there is nothing to filter.
+        if getattr(self, "unused_kwargs", None) is None:
+            return kwargs
+
+ for kwarg_name in self.unused_kwargs:
+ if kwarg_name in kwargs:
+ logger.warning_once(f"This processor does not use the `{kwarg_name}` parameter. It will be ignored.")
+ kwargs.pop(kwarg_name)
+ return kwargs
+
+ @classmethod
+ def from_dict(cls, config_dict: dict[str, Any], **kwargs):
+ """
+ Instantiates a processor from a Python dictionary of parameters.
+
+ Args:
+ config_dict (`dict[str, Any]`):
+ Dictionary that will be used to instantiate the processor object.
+ kwargs (`dict[str, Any]`):
+ Additional parameters from which to initialize the processor object.
+
+ Returns:
+ A processor of type [`~PreprocessingMixin`].
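+
+        Example (with the hypothetical subclass ``MyAudioProcessor`` from the class docstring)::
+
+            config = {"sampling_rate": 16000, "n_mels": 80}
+            processor = MyAudioProcessor.from_dict(config)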
+ """
+ config_dict = config_dict.copy()
+ return_unused_kwargs = kwargs.pop("return_unused_kwargs", False)
+
+ # Use valid_kwargs pattern when available (image/audio processors)
+ if hasattr(cls, "valid_kwargs") and hasattr(cls.valid_kwargs, "__annotations__"):
+ config_dict.update({k: v for k, v in kwargs.items() if k in cls.valid_kwargs.__annotations__})
+ processor = cls(**config_dict)
+
+ # Apply extra kwargs to instance (BC for remote code)
+ extra_keys = []
+ for key in reversed(list(kwargs.keys())):
+ if hasattr(processor, key) and key not in cls.valid_kwargs.__annotations__:
+ setattr(processor, key, kwargs.pop(key, None))
+ extra_keys.append(key)
+ if extra_keys:
+ logger.warning_once(
+ f"Processor {cls.__name__}: kwargs {extra_keys} were applied for backward compatibility. "
+ f"To avoid this warning, add them to valid_kwargs."
+ )
+ else:
+ processor = cls(**config_dict)
+
+ logger.info(f"Processor {processor}")
+ if return_unused_kwargs:
+ return processor, kwargs
+ else:
+ return processor
+
+ @classmethod
+ def from_pretrained(
+ cls: type[PreprocessingMixinType],
+ pretrained_model_name_or_path: str | os.PathLike,
+ cache_dir: str | os.PathLike | None = None,
+ force_download: bool = False,
+ local_files_only: bool = False,
+ token: str | bool | None = None,
+ revision: str = "main",
+ **kwargs,
+ ) -> PreprocessingMixinType:
+ r"""
+ Instantiate a processor from a pretrained model name or path.
+
+ Args:
+ pretrained_model_name_or_path (`str` or `os.PathLike`):
+ This can be either:
+
+ - a string, the *model id* of a pretrained processor hosted inside a model repo on
+ huggingface.co.
+ - a path to a *directory* containing a processor file saved using the
+ [`~PreprocessingMixin.save_pretrained`] method, e.g., `./my_model_directory/`.
+ - a path or url to a saved processor JSON *file*, e.g.,
+ `./my_model_directory/preprocessor_config.json`.
+ cache_dir (`str` or `os.PathLike`, *optional*):
+ Path to a directory in which a downloaded pretrained model processor should be cached if the
+ standard cache should not be used.
+ force_download (`bool`, *optional*, defaults to `False`):
+ Whether or not to force to (re-)download the processor files and override the cached versions if
+ they exist.
+ token (`str` or `bool`, *optional*):
+ The token to use as HTTP bearer authorization for remote files.
+ revision (`str`, *optional*, defaults to `"main"`):
+ The specific model version to use.
+ return_unused_kwargs (`bool`, *optional*, defaults to `False`):
+ If `False`, then this function returns just the final processor object. If `True`, then this
+ functions returns a `Tuple(processor, unused_kwargs)` where *unused_kwargs* is a dictionary
+ consisting of the key/value pairs whose keys are not processor attributes.
+ kwargs (`dict[str, Any]`, *optional*):
+ The values in kwargs of any keys which are processor attributes will be used to override the
+ loaded values.
+
+ Returns:
+ A processor of type [`~PreprocessingMixin`].
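+
+        Example (illustrative identifiers; substitute a real checkpoint and a concrete subclass)::
+
+            processor = MyAudioProcessor.from_pretrained("org/model")
+            processor.save_pretrained("./my_model_directory/")
+            reloaded = MyAudioProcessor.from_pretrained("./my_model_directory/")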
+ """
+ kwargs["cache_dir"] = cache_dir
+ kwargs["force_download"] = force_download
+ kwargs["local_files_only"] = local_files_only
+ kwargs["revision"] = revision
+
+ if token is not None:
+ kwargs["token"] = token
+
+ config_dict, kwargs = cls._get_config_dict(pretrained_model_name_or_path, **kwargs)
+
+ return cls.from_dict(config_dict, **kwargs)
+
+ def save_pretrained(self, save_directory: str | os.PathLike, push_to_hub: bool = False, **kwargs):
+ """
+ Save a processor object to the directory `save_directory`, so that it can be re-loaded using the
+ [`~PreprocessingMixin.from_pretrained`] class method.
+
+ Args:
+ save_directory (`str` or `os.PathLike`):
+ Directory where the processor JSON file will be saved (will be created if it does not exist).
+ push_to_hub (`bool`, *optional*, defaults to `False`):
+ Whether or not to push your model to the Hugging Face model hub after saving it.
+ kwargs (`dict[str, Any]`, *optional*):
+ Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
+ """
+ if os.path.isfile(save_directory):
+ raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file")
+
+ os.makedirs(save_directory, exist_ok=True)
+
+ if push_to_hub:
+ commit_message = kwargs.pop("commit_message", None)
+ repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1])
+ repo_id = create_repo(repo_id, exist_ok=True, **kwargs).repo_id
+ files_timestamps = self._get_files_timestamps(save_directory)
+
+ # If we have a custom config, we copy the file defining it in the folder and set the attributes so it can be
+ # loaded from the Hub.
+ if self._auto_class is not None:
+ custom_object_save(self, save_directory, config=self)
+
+ # If we save using the predefined names, we can load using `from_pretrained`
+ output_file = os.path.join(save_directory, self._config_name)
+
+ self.to_json_file(output_file)
+ logger.info(f"{self._file_type_label} saved in {output_file}")
+
+ if push_to_hub:
+ self._upload_modified_files(
+ save_directory,
+ repo_id,
+ files_timestamps,
+ commit_message=commit_message,
+ token=kwargs.get("token"),
+ )
+
+ return [output_file]
+
+ @classmethod
+ def _get_config_dict(
+ cls, pretrained_model_name_or_path: str | os.PathLike, **kwargs
+ ) -> tuple[dict[str, Any], dict[str, Any]]:
+ """
+ From a `pretrained_model_name_or_path`, resolve to a dictionary of parameters, to be used for instantiating a
+ processor using `from_dict`.
+
+ Parameters:
+ pretrained_model_name_or_path (`str` or `os.PathLike`):
+ The identifier of the pre-trained checkpoint from which we want the dictionary of parameters.
+
+ Returns:
+            `tuple[dict, dict]`: The dictionaries that will be used to instantiate the processor object.
+ """
+ cache_dir = kwargs.pop("cache_dir", None)
+ force_download = kwargs.pop("force_download", False)
+ proxies = kwargs.pop("proxies", None)
+ token = kwargs.pop("token", None)
+ local_files_only = kwargs.pop("local_files_only", False)
+ revision = kwargs.pop("revision", None)
+ subfolder = kwargs.pop("subfolder", cls._subfolder_default)
+
+ # Allow overriding the config filename via a kwarg (e.g. image_processor_filename)
+ if cls._config_filename_kwarg is not None:
+ config_filename = kwargs.pop(cls._config_filename_kwarg, cls._config_name)
+ else:
+ config_filename = cls._config_name
+
+ from_pipeline = kwargs.pop("_from_pipeline", None)
+ from_auto_class = kwargs.pop("_from_auto", False)
+
+ user_agent = {"file_type": cls._file_type_label, "from_auto_class": from_auto_class}
+ if from_pipeline is not None:
+ user_agent["using_pipeline"] = from_pipeline
+
+ if is_offline_mode() and not local_files_only:
+ logger.info("Offline mode: forcing local_files_only=True")
+ local_files_only = True
+
+ pretrained_model_name_or_path = str(pretrained_model_name_or_path)
+        is_local = os.path.isdir(pretrained_model_name_or_path)
+        if is_local:
+            config_file = os.path.join(pretrained_model_name_or_path, config_filename)
+ if os.path.isfile(pretrained_model_name_or_path):
+ resolved_config_file = pretrained_model_name_or_path
+ resolved_processor_file = None
+ is_local = True
+ else:
+ config_file = config_filename
+ try:
+ resolved_processor_file = cached_file(
+ pretrained_model_name_or_path,
+ filename=PROCESSOR_NAME,
+ cache_dir=cache_dir,
+ force_download=force_download,
+ proxies=proxies,
+ local_files_only=local_files_only,
+ token=token,
+ user_agent=user_agent,
+ revision=revision,
+ subfolder=subfolder,
+ _raise_exceptions_for_missing_entries=False,
+ )
+ resolved_config_file = cached_file(
+ pretrained_model_name_or_path,
+ filename=config_file,
+ cache_dir=cache_dir,
+ force_download=force_download,
+ proxies=proxies,
+ local_files_only=local_files_only,
+ token=token,
+ user_agent=user_agent,
+ revision=revision,
+ subfolder=subfolder,
+ _raise_exceptions_for_missing_entries=False,
+ )
+ except OSError:
+            # Re-raise any environment error raised by `cached_file`; it already carries a helpful
+            # error message adapted to the original exception.
+ raise
+ except Exception:
+ # For any other exception, we throw a generic error.
+ raise OSError(
+ f"Can't load {cls._file_type_label} for '{pretrained_model_name_or_path}'. If you were trying to load"
+ " it from 'https://huggingface.co/models', make sure you don't have a local directory with the"
+ f" same name. Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a"
+ f" directory containing a {config_filename} file"
+ )
+
+        # Load the config dict. Priority: nested config (if found) -> standalone config.
+        # We download both configs because almost all models have a `processor_config.json`, but
+        # not all of these are nested. We need to check whether it was saved recently in the
+        # nested style or is a legacy, standalone config.
+ config_dict = None
+ if resolved_processor_file is not None:
+ processor_dict = safe_load_json_file(resolved_processor_file)
+ for nested_key in cls._nested_config_keys:
+ if nested_key in processor_dict:
+ config_dict = processor_dict[nested_key]
+ break
+
+ if resolved_config_file is not None and config_dict is None:
+ config_dict = safe_load_json_file(resolved_config_file)
+
+ if config_dict is None:
+ raise OSError(
+ f"Can't load {cls._file_type_label} for '{pretrained_model_name_or_path}'. If you were trying to load"
+ " it from 'https://huggingface.co/models', make sure you don't have a local directory with the"
+ f" same name. Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a"
+ f" directory containing a {config_filename} file"
+ )
+
+ if is_local:
+ logger.info(f"loading configuration file {resolved_config_file}")
+ else:
+ logger.info(
+ f"loading configuration file {config_file} from cache at {resolved_config_file}"
+ )
+
+ return config_dict, kwargs
+
+ def to_dict(self) -> dict[str, Any]:
+ """
+ Serializes this instance to a Python dictionary.
+
+ Returns:
+ `dict[str, Any]`: Dictionary of all the attributes that make up this instance.
+ """
+ output = copy.deepcopy(self.__dict__)
+ output[self._type_key] = self.__class__.__name__
+ output.pop("_valid_kwargs_names", None)
+ for key in self._excluded_dict_keys:
+ if key in output:
+ del output[key]
+ return output
+
+ @classmethod
+ def from_json_file(cls, json_file: str | os.PathLike):
+ """
+ Instantiates a processor from the path to a JSON file of parameters.
+
+ Args:
+ json_file (`str` or `os.PathLike`):
+ Path to the JSON file containing the parameters.
+
+ Returns:
+ A processor of type [`~PreprocessingMixin`]: The processor object instantiated from that JSON file.
+ """
+ with open(json_file, encoding="utf-8") as reader:
+ text = reader.read()
+ config_dict = json.loads(text)
+ return cls(**config_dict)
+
+ def to_json_string(self) -> str:
+ """
+ Serializes this instance to a JSON string.
+
+ Returns:
+ `str`: String containing all the attributes that make up this instance in JSON format.
+ """
+ dictionary = self.to_dict()
+
+ for key, value in dictionary.items():
+ if isinstance(value, np.ndarray):
+ dictionary[key] = value.tolist()
+
+ return json.dumps(dictionary, indent=2, sort_keys=True) + "\n"
+
+ def to_json_file(self, json_file_path: str | os.PathLike):
+ """
+ Save this instance to a JSON file.
+
+ Args:
+ json_file_path (`str` or `os.PathLike`):
+ Path to the JSON file in which this instance's parameters will be saved.
+ """
+ with open(json_file_path, "w", encoding="utf-8") as writer:
+ writer.write(self.to_json_string())
+
+ def __repr__(self):
+ return f"{self.__class__.__name__} {self.to_json_string()}"
+
+ @classmethod
+ def register_for_auto_class(cls, auto_class=None):
+ """
+ Register this class with a given auto class.
+
+ Args:
+ auto_class (`str` or `type`, *optional*):
+ The auto class to register this new processor with. Defaults to the subclass's `_auto_class_default`.
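+
+        Example (with a hypothetical subclass)::
+
+            MyAudioProcessor.register_for_auto_class("AutoFeatureExtractor")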
+ """
+ if auto_class is None:
+ auto_class = cls._auto_class_default
+
+ if not isinstance(auto_class, str):
+ auto_class = auto_class.__name__
+
+ import transformers.models.auto as auto_module
+
+ if not hasattr(auto_module, auto_class):
+ raise ValueError(f"{auto_class} is not a valid auto class.")
+
+ cls._auto_class = auto_class
diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py
index fb1bd18c6239..cbbdd63d8110 100644
--- a/src/transformers/processing_utils.py
+++ b/src/transformers/processing_utils.py
@@ -408,6 +408,7 @@ class AudioKwargs(TypedDict, total=False):
- `'np'`: Return NumPy `np.ndarray` objects.
"""
+ sample_rate: Annotated[int | None, positive_int()]
sampling_rate: Annotated[int | None, positive_int()]
raw_speech: Union["np.ndarray", list[float], list["np.ndarray"], list[list[float]]] | None
padding: Annotated[bool | str | PaddingStrategy | None, padding_validator()]
@@ -416,6 +417,8 @@ class AudioKwargs(TypedDict, total=False):
pad_to_multiple_of: Annotated[int | None, positive_int()]
return_attention_mask: bool | None
return_tensors: Annotated[str | TensorType | None, tensor_type_validator()]
+ do_normalize: bool | None
+ device: str | None
class ProcessingKwargs(TypedDict, total=False):
diff --git a/src/transformers/torch_mel_spectrogram.py b/src/transformers/torch_mel_spectrogram.py
new file mode 100644
index 000000000000..3d48f2b8192a
--- /dev/null
+++ b/src/transformers/torch_mel_spectrogram.py
@@ -0,0 +1,522 @@
+"""PyTorch implementation of mel spectrogram computation."""
+
+import math
+
+import torch
+
+
+# --- Frequency conversion utilities ---
+
+def _hertz_to_mel_scalar(freq: float, mel_scale: str = "htk") -> float:
+ """Convert a single Hz value to mel using Python math (float64)."""
+ if mel_scale == "htk":
+ return 2595.0 * math.log10(1.0 + freq / 700.0)
+ elif mel_scale == "kaldi":
+ return 1127.0 * math.log(1.0 + freq / 700.0)
+ # slaney
+ f_sp = 200.0 / 3
+ min_log_hz = 1000.0
+ min_log_mel = (min_log_hz - 0.0) / f_sp
+ logstep = math.log(6.4) / 27.0
+ if freq >= min_log_hz:
+ return min_log_mel + math.log(freq / min_log_hz) / logstep
+ return (freq - 0.0) / f_sp
+
+
+def hertz_to_mel(freq: torch.Tensor, mel_scale: str = "htk") -> torch.Tensor:
+ if mel_scale == "htk":
+ return 2595.0 * torch.log10(1.0 + freq / 700.0)
+ elif mel_scale == "kaldi":
+ return 1127.0 * torch.log(1.0 + freq / 700.0)
+ # slaney
+ f_sp = 200.0 / 3
+ min_log_hertz = 1000.0
+ min_log_mel = min_log_hertz / f_sp
+ logstep = 27.0 / torch.log(torch.tensor(6.4))
+ mels = freq / f_sp
+ log_region = freq >= min_log_hertz
+ mels[log_region] = min_log_mel + torch.log(freq[log_region] / min_log_hertz) * logstep
+ return mels
+
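+# Sanity check for these scale definitions (illustrative, not part of the API):
+# on the HTK scale, 1000 Hz maps to roughly 1000 mel by construction, since
+# 2595 * log10(1 + 1000 / 700) ≈ 999.99:
+#
+#     hertz_to_mel(torch.tensor([1000.0]))  # ≈ tensor([999.99])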
+
+def mel_to_hertz(mels: torch.Tensor, mel_scale: str = "htk") -> torch.Tensor:
+ if mel_scale == "htk":
+ return 700.0 * (10.0 ** (mels / 2595.0) - 1.0)
+ elif mel_scale == "kaldi":
+ return 700.0 * (torch.exp(mels / 1127.0) - 1.0)
+ # slaney
+ f_sp = 200.0 / 3
+ min_log_hz = 1000.0
+ min_log_mel = (min_log_hz - 0.0) / f_sp
+ logstep = math.log(6.4) / 27.0
+ freq = 0.0 + f_sp * mels
+ log_region = mels >= min_log_mel
+ freq[log_region] = min_log_hz * torch.exp(logstep * (mels[log_region] - min_log_mel))
+ return freq
+
+
+def _create_triangular_filter_bank(
+ fft_freqs: torch.Tensor, filter_freqs: torch.Tensor
+) -> torch.Tensor:
+ filter_diff = filter_freqs[1:] - filter_freqs[:-1]
+ slopes = filter_freqs.unsqueeze(0) - fft_freqs.unsqueeze(1)
+ down_slopes = -slopes[:, :-2] / filter_diff[:-1]
+ up_slopes = slopes[:, 2:] / filter_diff[1:]
+ return torch.clamp(torch.minimum(down_slopes, up_slopes), min=0)
+
+
+def _kaldi_mel_filter_bank(
+ num_frequency_bins: int,
+ num_mel_filters: int,
+ min_frequency: float,
+ max_frequency: float,
+ sampling_rate: int,
+) -> torch.Tensor:
+ """Compute mel filter bank matching kaldi's exact construction.
+
+ Replicates torchaudio.compliance.kaldi.get_mel_banks exactly:
+ - Uses 1127*ln mel scale (not 2595*log10)
+ - Computes mel points via mel_low + i * delta (not torch.linspace)
+ - Uses n_fft/2 FFT bins (excludes Nyquist), then pads with zero column
+
+ Returns:
+ Tensor of shape (num_frequency_bins, num_mel_filters).
+ """
+ n_fft = (num_frequency_bins - 1) * 2
+ num_fft_bins = n_fft // 2 # kaldi excludes Nyquist bin
+ fft_bin_width = sampling_rate / n_fft
+
+ mel_low = 1127.0 * math.log(1.0 + min_frequency / 700.0)
+ mel_high = 1127.0 * math.log(1.0 + max_frequency / 700.0)
+ mel_delta = (mel_high - mel_low) / (num_mel_filters + 1)
+
+ bin_idx = torch.arange(num_mel_filters).unsqueeze(1)
+ left_mel = mel_low + bin_idx * mel_delta
+ center_mel = mel_low + (bin_idx + 1.0) * mel_delta
+ right_mel = mel_low + (bin_idx + 2.0) * mel_delta
+
+ mel = 1127.0 * (1.0 + fft_bin_width * torch.arange(num_fft_bins) / 700.0).log()
+ mel = mel.unsqueeze(0)
+
+ up_slope = (mel - left_mel) / (center_mel - left_mel)
+ down_slope = (right_mel - mel) / (right_mel - center_mel)
+ banks = torch.max(torch.zeros(1), torch.min(up_slope, down_slope))
+
+ # kaldi pads a zero column for the Nyquist bin
+ banks = torch.nn.functional.pad(banks, (0, 1), mode="constant", value=0)
+
+ return banks.T # (num_frequency_bins, num_mel_filters)
+
+
+def mel_filter_bank_torch(
+ num_frequency_bins: int,
+ num_mel_filters: int,
+ min_frequency: float,
+ max_frequency: float,
+ sampling_rate: int,
+ norm: str | None = None,
+ mel_scale: str = "htk",
+ triangularize_in_mel_space: bool = False,
+ frequency_bin_mode: str = "rfft",
+    computation_dtype: torch.dtype | None = None,
+ bands_to_zero: int = 0,
+) -> torch.Tensor:
+ """Compute mel filter bank as a pure PyTorch tensor.
+
+ Matches torchaudio's melscale_fbanks: mel range endpoints are computed in
+ float64 (Python math), then all tensor work is done in the default dtype
+ (float32).
+
+ Args:
+ computation_dtype: If provided, all intermediate tensor operations are
+ performed in this dtype (e.g. ``torch.float64``), and the result is
+ cast back to the default dtype. This is useful to obtain results
+ that are numerically identical to a NumPy (float64) reference
+ implementation.
+ bands_to_zero: Number of lowest frequency bins to zero out before
+ building the filter bank. The zeroed rows are restored (as zeros)
+ in the output. Set to 1 to exclude the DC bin (HTK / LASR style).
+
+ Returns:
+ Tensor of shape (num_frequency_bins, num_mel_filters).
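+
+    Example (a minimal sketch)::
+
+        fbank = mel_filter_bank_torch(
+            num_frequency_bins=201,  # n_fft = 400 -> 400 // 2 + 1 bins
+            num_mel_filters=80,
+            min_frequency=0.0,
+            max_frequency=8000.0,
+            sampling_rate=16000,
+        )
+        # fbank.shape == (201, 80)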
+ """
+ if triangularize_in_mel_space and bands_to_zero == 0:
+ # Kaldi-exact path: matches torchaudio.compliance.kaldi.get_mel_banks.
+ # Kept for backward compatibility with models that rely on this behaviour
+ # (AST, SeamlessM4T, Speech2Text, etc.).
+ return _kaldi_mel_filter_bank(
+ num_frequency_bins, num_mel_filters, min_frequency, max_frequency, sampling_rate,
+ )
+
+ mel_min = _hertz_to_mel_scalar(min_frequency, mel_scale=mel_scale)
+ mel_max = _hertz_to_mel_scalar(max_frequency, mel_scale=mel_scale)
+
+ n_fft = (num_frequency_bins - 1) * 2
+
+ if triangularize_in_mel_space:
+ # Kaldi-style direct slope computation in mel space.
+ # Uses mel_low + i * delta (not linspace) and direct per-band slopes
+ # to match the exact numerical behaviour of kaldi/HTK filter banks.
+ mel_delta = (mel_max - mel_min) / (num_mel_filters + 1)
+ bin_idx = torch.arange(num_mel_filters, dtype=computation_dtype).unsqueeze(1)
+ left_mel = mel_min + bin_idx * mel_delta
+ center_mel = mel_min + (bin_idx + 1.0) * mel_delta
+ right_mel = mel_min + (bin_idx + 2.0) * mel_delta
+
+ fft_bin_width = sampling_rate / n_fft
+ hz_freqs = fft_bin_width * torch.arange(bands_to_zero, num_frequency_bins, dtype=computation_dtype)
+ mel = hertz_to_mel(hz_freqs, mel_scale=mel_scale).unsqueeze(0)
+
+ up_slope = (mel - left_mel) / (center_mel - left_mel)
+ down_slope = (right_mel - mel) / (right_mel - center_mel)
+ mel_filters = torch.max(torch.zeros(1, dtype=computation_dtype), torch.min(up_slope, down_slope))
+
+ # Transpose to (num_fft_bins, num_mel_filters) and restore zeroed bands
+ mel_filters = mel_filters.T
+ if bands_to_zero > 0:
+ mel_filters = torch.nn.functional.pad(mel_filters, (0, 0, bands_to_zero, 0))
+
+ return mel_filters
+
+ mel_freqs = torch.linspace(mel_min, mel_max, num_mel_filters + 2, dtype=computation_dtype)
+
+ filter_freqs = mel_to_hertz(mel_freqs, mel_scale=mel_scale)
+ if frequency_bin_mode == "rfft":
+ fft_freqs = torch.fft.rfftfreq(n=n_fft, d=1.0 / sampling_rate)
+ else:
+ fft_freqs = torch.linspace(0, sampling_rate // 2, num_frequency_bins)
+ if computation_dtype is not None:
+ fft_freqs = fft_freqs.to(computation_dtype)
+
+ mel_filters = _create_triangular_filter_bank(fft_freqs, filter_freqs)
+
+ if norm == "slaney":
+ enorm = 2.0 / (filter_freqs[2 : num_mel_filters + 2] - filter_freqs[:num_mel_filters])
+ mel_filters = mel_filters * enorm.unsqueeze(0)
+
+ if bands_to_zero > 0:
+ mel_filters = torch.nn.functional.pad(mel_filters, (0, 0, bands_to_zero, 0))
+
+ return mel_filters
+
+
+def window_function(window_length, name="hann_window", periodic=True, wkwargs=None):
+ """Create a window tensor using torch window functions."""
+ if wkwargs is None:
+ wkwargs = {}
+ if name in ["hann", "hann_window"]:
+ return torch.hann_window(window_length, periodic=periodic, **wkwargs)
+ elif name in ["hamming", "hamming_window"]:
+ return torch.hamming_window(window_length, periodic=periodic, **wkwargs)
+ elif name == "boxcar":
+ return torch.ones(window_length)
+ elif name == "povey":
+ return torch.hann_window(window_length, periodic=periodic, **wkwargs).pow(0.85)
+ else:
+ raise ValueError(f"Unknown window function '{name}'")
+
+
+# --- Sub-methods ---
+
+def _prepare_window_and_framing(window, win_length, n_fft, needs_manual_framing):
+    """Pick the frame length and, for the `torch.stft` path, center-pad the window to `n_fft`.
+
+    With manual (kaldi-style) framing, frames stay at `win_length` and are later
+    zero-padded on the right inside the FFT buffer; otherwise the window is
+    centered in an `n_fft`-long buffer so `torch.stft` can be used directly.
+    """
+ if needs_manual_framing and win_length < n_fft:
+ frame_length = win_length
+ else:
+ if win_length < n_fft:
+ left_pad = (n_fft - win_length) // 2
+ right_pad = n_fft - win_length - left_pad
+ window = torch.nn.functional.pad(window, (left_pad, right_pad))
+ frame_length = n_fft
+ return window, frame_length
+
+
+def _apply_frame_processing(frames, *, dither=0.0, preemphasis=None, remove_dc_offset=False):
+ if dither != 0.0:
+ frames = frames + dither * torch.randn_like(frames)
+ if remove_dc_offset:
+ frames = frames - frames.mean(dim=-1, keepdim=True)
+ if preemphasis is not None:
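+        # Kaldi-style convention: the first sample is preemphasized against itself
+        # (replicate padding), i.e. x[0] - p * x[0] = x[0] * (1 - p).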
+ frames = torch.cat([
+ frames[..., :1] * (1 - preemphasis),
+ frames[..., 1:] - preemphasis * frames[..., :-1],
+ ], dim=-1)
+ return frames
+
+
+def _apply_mel_scale(
+ spectrogram: torch.Tensor,
+ mel_filters: torch.Tensor,
+ mel_floor: float = 1e-10,
+) -> torch.Tensor:
+ """Apply mel filterbank to a spectrogram.
+
+ Args:
+ spectrogram: Power spectrogram of shape (..., freq, time).
+ mel_filters: Mel filterbank of shape (freq, n_mels).
+ mel_floor: Minimum value for clamping.
+
+ Returns:
+ Mel spectrogram of shape (..., n_mels, time).
+ """
+ # (..., time, freq) @ (freq, n_mels) -> (..., time, n_mels) -> (..., n_mels, time)
+ mel_spec = torch.matmul(spectrogram.transpose(-2, -1), mel_filters).transpose(-2, -1)
+ return torch.clamp(mel_spec, min=mel_floor)
+
+
+def _torch_stft(
+    waveform, window, frame_length, hop_length, fft_length,
+    power, normalized, center, pad_mode,
+):
+    """Fast path using torch.stft. Returns a power spectrogram of shape (batch, freq, time)."""
+    stft_out = torch.stft(
+        waveform,
+        n_fft=fft_length,
+        hop_length=hop_length,
+        win_length=frame_length,
+        window=window,
+        center=center,
+        pad_mode=pad_mode,
+        normalized=False,
+        return_complex=True,
+    )
+    if normalized:
+        # Window-energy normalization, matching the manual STFT path below.
+        stft_out = stft_out / window.pow(2.0).sum().sqrt()
+    # Both call sites pass `power`; apply it here so the return value is a real-valued
+    # power spectrogram, consistent with `_manual_stft`.
+    return stft_out.abs() ** power
+
+
+def _manual_stft(
+ waveform, window, frame_length, hop_length, fft_length,
+ num_frequency_bins, power, normalized, center, pad_mode,
+ apply_frame_processing=None,
+):
+ """Manual framing STFT for kaldi-specific features. Returns power spectrogram of shape (batch, freq, time)."""
+ if center:
+ waveform = torch.nn.functional.pad(
+ waveform, (frame_length // 2, frame_length // 2), mode=pad_mode
+ )
+
+ # Extract all frames at once: (batch, num_frames, frame_length)
+ frames = waveform.unfold(-1, frame_length, hop_length)
+
+ if apply_frame_processing is not None:
+ frames = apply_frame_processing(frames)
+
+ frames = frames * window
+
+ # Zero-pad frames to fft_length if frame_length < fft_length (kaldi left-aligns in FFT buffer)
+ if frame_length < fft_length:
+ frames = torch.nn.functional.pad(frames, (0, fft_length - frame_length))
+
+ # Batched FFT: (batch, num_frames, fft_length) -> (batch, num_frames, num_frequency_bins)
+ spec = torch.fft.rfft(frames, n=fft_length)
+
+ if normalized:
+ spec = spec / window.pow(2.0).sum().sqrt()
+
+ spec = spec.abs() ** power
+
+ # (batch, num_frames, freq) -> (batch, freq, num_frames)
+ return spec.transpose(-2, -1)
+
+
+# --- Main function ---
+
+def mel_spectrogram(
+ waveform: torch.Tensor,
+ sampling_rate: int,
+ *,
+ n_fft: int = 400,
+ win_length: int | None = None,
+ hop_length: int | None = None,
+ window_fn: str = "hann_window",
+ wkwargs: dict | None = None,
+ power: float = 2.0,
+ center: bool = True,
+ pad_mode: str = "reflect",
+ normalized: bool = False,
+ periodic: bool = True,
+ # mel scale kwargs
+ n_mels: int = 128,
+ f_min: float = 0.0,
+ f_max: float | None = None,
+ mel_scale: str = "htk",
+ norm: str | None = None,
+ triangularize_in_mel_space: bool = False,
+ # kaldi-specific kwargs
+ dither: float = 0.0,
+ preemphasis: float | None = None,
+ remove_dc_offset: bool = False,
+ mel_floor: float = 1e-10,
+) -> torch.Tensor:
+ """Compute mel spectrogram using PyTorch.
+
+ Args:
+ waveform: Input waveform of shape (..., time).
+ sampling_rate: Sample rate in Hz.
+
+ Returns:
+ Mel spectrogram of shape (..., n_mels, time).
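+
+    Example (a minimal sketch)::
+
+        waveform = torch.randn(16000)
+        mel = mel_spectrogram(waveform, sampling_rate=16000, n_fft=400, n_mels=80)
+        # mel.shape == (80, num_frames)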
+ """
+ if f_max is None:
+ f_max = sampling_rate / 2.0
+
+ # --- STFT ---
+ if win_length is None:
+ win_length = n_fft
+ if hop_length is None:
+ hop_length = win_length // 2
+ device = waveform.device
+ dtype = waveform.dtype
+
+ needs_manual_framing = (dither != 0.0) or (preemphasis is not None) or remove_dc_offset
+
+ window_wkwargs = {**(wkwargs or {}), "dtype": dtype}
+ window = window_function(win_length, name=window_fn, periodic=periodic, wkwargs=window_wkwargs)
+ window = window.to(device=device)
+ window, frame_length = _prepare_window_and_framing(window, win_length, n_fft, needs_manual_framing)
+
+ is_1d = waveform.ndim == 1
+ if is_1d:
+ waveform = waveform.unsqueeze(0)
+ leading_shape = waveform.shape[:-1]
+ waveform = waveform.reshape(-1, waveform.shape[-1])
+ if needs_manual_framing:
+ frame_proc = lambda f: _apply_frame_processing(
+ f, dither=dither, preemphasis=preemphasis, remove_dc_offset=remove_dc_offset,
+ )
+ spectrogram = _manual_stft(
+ waveform, window, frame_length, hop_length, n_fft,
+ n_fft // 2 + 1, power, normalized, center, pad_mode,
+ apply_frame_processing=frame_proc,
+ )
+ else:
+ spectrogram = _torch_stft(
+ waveform, window, frame_length, hop_length, n_fft,
+ power, normalized, center, pad_mode,
+ )
+
+ spectrogram = spectrogram.reshape(*leading_shape, spectrogram.shape[-2], spectrogram.shape[-1])
+ if is_1d:
+ spectrogram = spectrogram.squeeze(0)
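+    # Downcast to float32 so the result matches the (float32) mel filter bank below.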
+ spectrogram = spectrogram.float()
+
+ num_frequency_bins = spectrogram.shape[-2]
+ mel_filters = mel_filter_bank_torch(
+ num_frequency_bins, n_mels, f_min, f_max, sampling_rate,
+ norm=norm, mel_scale=mel_scale,
+ triangularize_in_mel_space=triangularize_in_mel_space,
+ ).to(spectrogram.device)
+
+ return _apply_mel_scale(spectrogram, mel_filters, mel_floor=mel_floor)
+
+
+class MelSpectrogram(torch.nn.Module):
+ """Cached mel spectrogram transform — precomputes window and mel filterbank.
+
+ Same API and exact same results as the functional ``mel_spectrogram``, but
+ avoids recomputing the window and mel filterbank on every call.
+
+ Usage::
+
+ transform = MelSpectrogram(sampling_rate=16000, n_fft=1024, n_mels=80)
+ transform = transform.cuda() # move buffers to GPU once
+ mel = transform(waveform) # fast repeated calls
+ """
+
+ def __init__(
+ self,
+ sampling_rate: int,
+ *,
+ n_fft: int = 400,
+ win_length: int | None = None,
+ hop_length: int | None = None,
+ window_fn: str = "hann_window",
+ wkwargs: dict | None = None,
+ power: float = 2.0,
+ center: bool = True,
+ pad_mode: str = "reflect",
+ normalized: bool = False,
+ periodic: bool = True,
+ n_mels: int = 128,
+ f_min: float = 0.0,
+ f_max: float | None = None,
+ mel_scale: str = "htk",
+ norm: str | None = None,
+ triangularize_in_mel_space: bool = False,
+ dither: float = 0.0,
+ preemphasis: float | None = None,
+ remove_dc_offset: bool = False,
+ mel_floor: float = 1e-10,
+ ):
+ super().__init__()
+ self.sampling_rate = sampling_rate
+ self.n_fft = n_fft
+ self.win_length = win_length if win_length is not None else n_fft
+ self.hop_length = hop_length if hop_length is not None else self.win_length // 2
+ self.power = power
+ self.center = center
+ self.pad_mode = pad_mode
+ self.normalized = normalized
+ self.n_mels = n_mels
+ self.f_min = f_min
+ self.f_max = f_max if f_max is not None else sampling_rate / 2.0
+ self.mel_floor = mel_floor
+ self.dither = dither
+ self.preemphasis = preemphasis
+ self.remove_dc_offset = remove_dc_offset
+
+ self._needs_manual_framing = (dither != 0.0) or (preemphasis is not None) or remove_dc_offset
+
+ # Build window
+ window = window_function(self.win_length, name=window_fn, periodic=periodic, wkwargs=wkwargs)
+ window, self._frame_length = _prepare_window_and_framing(window, self.win_length, n_fft, self._needs_manual_framing)
+ self.register_buffer("window", window)
+
+ # Build mel filterbank
+ num_frequency_bins = n_fft // 2 + 1
+ mel_fb = mel_filter_bank_torch(
+ num_frequency_bins, n_mels, self.f_min, self.f_max, sampling_rate,
+ norm=norm, mel_scale=mel_scale,
+ triangularize_in_mel_space=triangularize_in_mel_space,
+ )
+ self.register_buffer("mel_filters", mel_fb)
+
+ def forward(self, waveform: torch.Tensor) -> torch.Tensor:
+ """Compute mel spectrogram.
+
+ Args:
+ waveform: Input of shape (..., time).
+
+ Returns:
+ Mel spectrogram of shape (..., n_mels, time).
+ """
+ is_1d = waveform.ndim == 1
+ if is_1d:
+ waveform = waveform.unsqueeze(0)
+
+ leading_shape = waveform.shape[:-1]
+ waveform = waveform.reshape(-1, waveform.shape[-1])
+
+ if self._needs_manual_framing:
+ frame_proc = lambda f: _apply_frame_processing(
+ f, dither=self.dither, preemphasis=self.preemphasis, remove_dc_offset=self.remove_dc_offset,
+ )
+ spec = _manual_stft(
+ waveform, self.window, self._frame_length, self.hop_length,
+ self.n_fft, self.n_fft // 2 + 1, self.power, self.normalized,
+ self.center, self.pad_mode,
+ apply_frame_processing=frame_proc,
+ )
+ else:
+ spec = _torch_stft(
+ waveform, self.window, self._frame_length, self.hop_length,
+ self.n_fft, self.power, self.normalized, self.center, self.pad_mode,
+ )
+
+ spec = spec.reshape(*leading_shape, spec.shape[-2], spec.shape[-1])
+ if is_1d:
+ spec = spec.squeeze(0)
+ spec = spec.float()
+
+ return _apply_mel_scale(spec, self.mel_filters, mel_floor=self.mel_floor)
diff --git a/src/transformers/utils/deprecation.py b/src/transformers/utils/deprecation.py
index db0e67325d78..9b44e549df1b 100644
--- a/src/transformers/utils/deprecation.py
+++ b/src/transformers/utils/deprecation.py
@@ -33,6 +33,41 @@ class Action(ExplicitEnum):
RAISE = "raise"
+def deprecated_feature_extractor(audio_processor_class, old_class_name, version="5.5"):
+ """Create a deprecated FeatureExtractor alias for an AudioProcessor.
+
+ Uses dynamic class creation to reduce boilerplate across ~20 models.
+ """
+
+    def __init__(self, *args, **kwargs):
+        warnings.warn(
+            f"`{old_class_name}` is deprecated and will be removed in v{version}. "
+            f"Use `{audio_processor_class.__name__}` instead.",
+            FutureWarning,
+            stacklevel=2,
+        )
+        # Resolve against the alias class created below (via closure); using
+        # `super(type(self), self)` would recurse infinitely if the alias is subclassed.
+        super(deprecated_class, self).__init__(*args, **kwargs)
+
+    def __init_subclass__(cls, **kwargs):
+        warnings.warn(
+            f"`{old_class_name}` is deprecated and will be removed in v{version}. "
+            f"Use `{audio_processor_class.__name__}` instead.",
+            FutureWarning,
+        )
+        super(deprecated_class, cls).__init_subclass__(**kwargs)
+
+    deprecated_class = type(
+        old_class_name,
+        (audio_processor_class,),
+        {
+            "__init__": __init__,
+            "__init_subclass__": __init_subclass__,
+            "__module__": audio_processor_class.__module__,
+            "__doc__": f"Deprecated. Use {audio_processor_class.__name__} instead.",
+        },
+    )
+    return deprecated_class
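+
+
+# Hypothetical usage (class names are illustrative, not actual transformers symbols):
+#
+#     Wav2Vec2FeatureExtractor = deprecated_feature_extractor(
+#         Wav2Vec2AudioProcessor, "Wav2Vec2FeatureExtractor"
+#     )
+#     Wav2Vec2FeatureExtractor()  # emits a FutureWarning pointing at the new class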
+
+
def deprecate_kwarg(
old_name: str,
version: str,