From ccade7f370854dc07d6643a6eb52c201ba112661 Mon Sep 17 00:00:00 2001 From: Jonghwan Hyeon Date: Tue, 21 Apr 2026 20:23:18 +0900 Subject: [PATCH] fix: apply channel averaging correctly in audio feature extractors --- .../models/cohere_asr/feature_extraction_cohere_asr.py | 6 +++--- src/transformers/models/lasr/feature_extraction_lasr.py | 6 +++--- .../models/parakeet/feature_extraction_parakeet.py | 6 +++--- .../phi4_multimodal/feature_extraction_phi4_multimodal.py | 6 +++--- .../voxtral_realtime/feature_extraction_voxtral_realtime.py | 6 +++--- 5 files changed, 15 insertions(+), 15 deletions(-) diff --git a/src/transformers/models/cohere_asr/feature_extraction_cohere_asr.py b/src/transformers/models/cohere_asr/feature_extraction_cohere_asr.py index 1192be10606d..42f4bf3117da 100644 --- a/src/transformers/models/cohere_asr/feature_extraction_cohere_asr.py +++ b/src/transformers/models/cohere_asr/feature_extraction_cohere_asr.py @@ -284,17 +284,17 @@ def __call__( f"Only mono-channel audio is supported for input to {self.__class__.__name__}. " "We will take the mean of the channels to convert to mono." ) - raw_speech = raw_speech.mean(-1) + raw_speech = raw_speech.mean(1) is_batched_sequence = isinstance(raw_speech, (list, tuple)) if is_batched_sequence: - for speech in raw_speech: + for index, speech in enumerate(raw_speech): if len(speech.shape) > 1: logger.warning( f"Only mono-channel audio is supported for input to {self.__class__.__name__}. " "We will take the mean of the channels to convert to mono." ) - speech = speech.mean(-1) + raw_speech[index] = speech.mean(0) if is_batched_torch or is_batched_sequence: raw_speech = [speech.to(torch.float32) for speech in raw_speech] diff --git a/src/transformers/models/lasr/feature_extraction_lasr.py b/src/transformers/models/lasr/feature_extraction_lasr.py index 7cf1822ee40d..26cacd39b09a 100644 --- a/src/transformers/models/lasr/feature_extraction_lasr.py +++ b/src/transformers/models/lasr/feature_extraction_lasr.py @@ -232,17 +232,17 @@ def __call__( f"Only mono-channel audio is supported for input to {self.__class__.__name__}. " "We will take the mean of the channels to convert to mono." ) - raw_speech = raw_speech.mean(-1) + raw_speech = raw_speech.mean(1) is_batched_sequence = isinstance(raw_speech, (list, tuple)) if is_batched_sequence: - for speech in raw_speech: + for index, speech in enumerate(raw_speech): if len(speech.shape) > 1: logger.warning( f"Only mono-channel audio is supported for input to {self.__class__.__name__}. " "We will take the mean of the channels to convert to mono." ) - speech = speech.mean(-1) + raw_speech[index] = speech.mean(0) if is_batched_torch or is_batched_sequence: raw_speech = [speech[:, None].to(torch.float32) for speech in raw_speech] diff --git a/src/transformers/models/parakeet/feature_extraction_parakeet.py b/src/transformers/models/parakeet/feature_extraction_parakeet.py index c745d02c9629..95289cc00d99 100644 --- a/src/transformers/models/parakeet/feature_extraction_parakeet.py +++ b/src/transformers/models/parakeet/feature_extraction_parakeet.py @@ -217,17 +217,17 @@ def __call__( f"Only mono-channel audio is supported for input to {self.__class__.__name__}. " "We will take the mean of the channels to convert to mono." ) - raw_speech = raw_speech.mean(-1) + raw_speech = raw_speech.mean(1) is_batched_sequence = isinstance(raw_speech, (list, tuple)) if is_batched_sequence: - for speech in raw_speech: + for index, speech in enumerate(raw_speech): if len(speech.shape) > 1: logger.warning( f"Only mono-channel audio is supported for input to {self.__class__.__name__}. " "We will take the mean of the channels to convert to mono." ) - speech = speech.mean(-1) + raw_speech[index] = speech.mean(0) if is_batched_torch or is_batched_sequence: raw_speech = [speech[:, None].to(torch.float32) for speech in raw_speech] diff --git a/src/transformers/models/phi4_multimodal/feature_extraction_phi4_multimodal.py b/src/transformers/models/phi4_multimodal/feature_extraction_phi4_multimodal.py index 9ce98251e50e..3c3c1723a35a 100644 --- a/src/transformers/models/phi4_multimodal/feature_extraction_phi4_multimodal.py +++ b/src/transformers/models/phi4_multimodal/feature_extraction_phi4_multimodal.py @@ -145,17 +145,17 @@ def __call__( f"Only mono-channel audio is supported for input to {self.__class__.__name__}. " "We will take the mean of the channels to convert to mono." ) - raw_speech = raw_speech.mean(-1) + raw_speech = raw_speech.mean(1) is_batched_sequence = isinstance(raw_speech, (list, tuple)) if is_batched_sequence: - for speech in raw_speech: + for index, speech in enumerate(raw_speech): if len(speech.shape) > 1: logger.warning( f"Only mono-channel audio is supported for input to {self.__class__.__name__}. " "We will take the mean of the channels to convert to mono." ) - speech = speech.mean(-1) + raw_speech[index] = speech.mean(0) if is_batched_torch or is_batched_sequence: raw_speech = [speech[:, None].to(torch.float32) for speech in raw_speech] diff --git a/src/transformers/models/voxtral_realtime/feature_extraction_voxtral_realtime.py b/src/transformers/models/voxtral_realtime/feature_extraction_voxtral_realtime.py index 58355f3c0d7c..f13006f6b198 100644 --- a/src/transformers/models/voxtral_realtime/feature_extraction_voxtral_realtime.py +++ b/src/transformers/models/voxtral_realtime/feature_extraction_voxtral_realtime.py @@ -203,17 +203,17 @@ def __call__( f"Only mono-channel audio is supported for input to {self.__class__.__name__}. " "We will take the mean of the channels to convert to mono." ) - raw_speech = raw_speech.mean(-1) + raw_speech = raw_speech.mean(1) is_batched_sequence = isinstance(raw_speech, (list, tuple)) if is_batched_sequence: - for speech in raw_speech: + for index, speech in enumerate(raw_speech): if len(speech.shape) > 1: logger.warning( f"Only mono-channel audio is supported for input to {self.__class__.__name__}. " "We will take the mean of the channels to convert to mono." ) - speech = speech.mean(-1) + raw_speech[index] = speech.mean(0) if is_batched_torch or is_batched_sequence: raw_speech = [speech[:, None].to(torch.float32) for speech in raw_speech]