diff --git a/src/transformers/audio_utils.py b/src/transformers/audio_utils.py index 694ab5f50df1..e8684ab70218 100644 --- a/src/transformers/audio_utils.py +++ b/src/transformers/audio_utils.py @@ -1081,146 +1081,3 @@ def amplitude_to_db_batch( spectrogram = np.clip(spectrogram, a_min=max_values - db_range, a_max=None) return spectrogram - - -### deprecated functions below this line ### - - -def get_mel_filter_banks( - nb_frequency_bins: int, - nb_mel_filters: int, - frequency_min: float, - frequency_max: float, - sample_rate: int, - norm: Optional[str] = None, - mel_scale: str = "htk", -) -> np.array: - warnings.warn( - "The function `get_mel_filter_banks` is deprecated and will be removed in version 4.31.0 of Transformers", - FutureWarning, - ) - return mel_filter_bank( - num_frequency_bins=nb_frequency_bins, - num_mel_filters=nb_mel_filters, - min_frequency=frequency_min, - max_frequency=frequency_max, - sampling_rate=sample_rate, - norm=norm, - mel_scale=mel_scale, - ) - - -def fram_wave(waveform: np.array, hop_length: int = 160, fft_window_size: int = 400, center: bool = True): - """ - In order to compute the short time fourier transform, the waveform needs to be split in overlapping windowed - segments called `frames`. - - The window length (window_length) defines how much of the signal is contained in each frame, while the hop length - defines the step between the beginning of each new frame. - - - Args: - waveform (`np.array` of shape `(sample_length,)`): - The raw waveform which will be split into smaller chunks. - hop_length (`int`, *optional*, defaults to 160): - Step between each window of the waveform. - fft_window_size (`int`, *optional*, defaults to 400): - Defines the size of the window. - center (`bool`, defaults to `True`): - Whether or not to center each frame around the middle of the frame. Centering is done by reflecting the - waveform on the left and on the right. - - Return: - framed_waveform (`np.array` of shape `(waveform.shape // hop_length , fft_window_size)`): - The framed waveforms that can be fed to `np.fft`. - """ - warnings.warn( - "The function `fram_wave` is deprecated and will be removed in version 4.31.0 of Transformers", - FutureWarning, - ) - frames = [] - for i in range(0, waveform.shape[0] + 1, hop_length): - if center: - half_window = (fft_window_size - 1) // 2 + 1 - start = i - half_window if i > half_window else 0 - end = i + half_window if i < waveform.shape[0] - half_window else waveform.shape[0] - frame = waveform[start:end] - if start == 0: - padd_width = (-i + half_window, 0) - frame = np.pad(frame, pad_width=padd_width, mode="reflect") - - elif end == waveform.shape[0]: - padd_width = (0, (i - waveform.shape[0] + half_window)) - frame = np.pad(frame, pad_width=padd_width, mode="reflect") - - else: - frame = waveform[i : i + fft_window_size] - frame_width = frame.shape[0] - if frame_width < waveform.shape[0]: - frame = np.pad(frame, pad_width=(0, fft_window_size - frame_width), mode="constant", constant_values=0) - frames.append(frame) - - frames = np.stack(frames, 0) - return frames - - -def stft(frames: np.array, windowing_function: np.array, fft_window_size: Optional[int] = None): - """ - Calculates the complex Short-Time Fourier Transform (STFT) of the given framed signal. Should give the same results - as `torch.stft`. - - Args: - frames (`np.array` of dimension `(num_frames, fft_window_size)`): - A framed audio signal obtained using `audio_utils.fram_wav`. - windowing_function (`np.array` of dimension `(nb_frequency_bins, nb_mel_filters)`: - A array representing the function that will be used to reduces the amplitude of the discontinuities at the - boundaries of each frame when computing the STFT. Each frame will be multiplied by the windowing_function. - For more information on the discontinuities, called *Spectral leakage*, refer to [this - tutorial]https://download.ni.com/evaluation/pxi/Understanding%20FFTs%20and%20Windowing.pdf - fft_window_size (`int`, *optional*): - Size of the window om which the Fourier transform is applied. This controls the frequency resolution of the - spectrogram. 400 means that the fourier transform is computed on windows of 400 samples. The number of - frequency bins (`nb_frequency_bins`) used to divide the window into equal strips is equal to - `(1+fft_window_size)//2`. An increase of the fft_window_size slows the calculus time proportionally. - - Example: - - ```python - >>> from transformers.audio_utils import stft, fram_wave - >>> import numpy as np - - >>> audio = np.random.rand(50) - >>> fft_window_size = 10 - >>> hop_length = 2 - >>> framed_audio = fram_wave(audio, hop_length, fft_window_size) - >>> spectrogram = stft(framed_audio, np.hanning(fft_window_size + 1)) - ``` - - Returns: - spectrogram (`np.ndarray`): - A spectrogram of shape `(num_frames, nb_frequency_bins)` obtained using the STFT algorithm - """ - warnings.warn( - "The function `stft` is deprecated and will be removed in version 4.31.0 of Transformers", - FutureWarning, - ) - frame_size = frames.shape[1] - - if fft_window_size is None: - fft_window_size = frame_size - - if fft_window_size < frame_size: - raise ValueError("FFT size must greater or equal the frame size") - # number of FFT bins to store - nb_frequency_bins = (fft_window_size >> 1) + 1 - - spectrogram = np.empty((len(frames), nb_frequency_bins), dtype=np.complex64) - fft_signal = np.zeros(fft_window_size) - - for f, frame in enumerate(frames): - if windowing_function is not None: - np.multiply(frame, windowing_function, out=fft_signal[:frame_size]) - else: - fft_signal[:frame_size] = frame - spectrogram[f] = np.fft.fft(fft_signal, axis=0)[:nb_frequency_bins] - return spectrogram.T