From ccade7f370854dc07d6643a6eb52c201ba112661 Mon Sep 17 00:00:00 2001
From: Jonghwan Hyeon <jonghwanhyeon93@gmail.com>
Date: Tue, 21 Apr 2026 20:23:18 +0900
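Subject: [PATCH] fix: apply channel averaging correctly in audio feature
 extractors

This patch changes the mono down-mix in the cohere_asr, lasr, parakeet,
phi4_multimodal and voxtral_realtime feature extractors in two ways:

- Batched tensor input was averaged with `mean(-1)`, i.e. over the last
  dimension. It is now averaged over dimension 1.
- For list/tuple input, the averaged result was only assigned to the loop
  variable, so it was discarded and the multi-channel item stayed in
  `raw_speech` unchanged. Each multi-channel item is now averaged over
  dimension 0 and written back into `raw_speech` by index.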

---
 .../models/cohere_asr/feature_extraction_cohere_asr.py      | 6 +++---
 src/transformers/models/lasr/feature_extraction_lasr.py     | 6 +++---
 .../models/parakeet/feature_extraction_parakeet.py          | 6 +++---
 .../phi4_multimodal/feature_extraction_phi4_multimodal.py   | 6 +++---
 .../voxtral_realtime/feature_extraction_voxtral_realtime.py | 6 +++---
 5 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/src/transformers/models/cohere_asr/feature_extraction_cohere_asr.py b/src/transformers/models/cohere_asr/feature_extraction_cohere_asr.py
index 1192be10606d..42f4bf3117da 100644
--- a/src/transformers/models/cohere_asr/feature_extraction_cohere_asr.py
+++ b/src/transformers/models/cohere_asr/feature_extraction_cohere_asr.py
@@ -284,17 +284,17 @@ def __call__(
                 f"Only mono-channel audio is supported for input to {self.__class__.__name__}. "
                 "We will take the mean of the channels to convert to mono."
             )
-            raw_speech = raw_speech.mean(-1)
+            raw_speech = raw_speech.mean(1)
 
         is_batched_sequence = isinstance(raw_speech, (list, tuple))
         if is_batched_sequence:
-            for speech in raw_speech:
+            for index, speech in enumerate(raw_speech):
                 if len(speech.shape) > 1:
                     logger.warning(
                         f"Only mono-channel audio is supported for input to {self.__class__.__name__}. "
                         "We will take the mean of the channels to convert to mono."
                     )
-                    speech = speech.mean(-1)
+                    raw_speech[index] = speech.mean(0)
 
         if is_batched_torch or is_batched_sequence:
             raw_speech = [speech.to(torch.float32) for speech in raw_speech]
diff --git a/src/transformers/models/lasr/feature_extraction_lasr.py b/src/transformers/models/lasr/feature_extraction_lasr.py
index 7cf1822ee40d..26cacd39b09a 100644
--- a/src/transformers/models/lasr/feature_extraction_lasr.py
+++ b/src/transformers/models/lasr/feature_extraction_lasr.py
@@ -232,17 +232,17 @@ def __call__(
                 f"Only mono-channel audio is supported for input to {self.__class__.__name__}. "
                 "We will take the mean of the channels to convert to mono."
             )
-            raw_speech = raw_speech.mean(-1)
+            raw_speech = raw_speech.mean(1)
 
         is_batched_sequence = isinstance(raw_speech, (list, tuple))
         if is_batched_sequence:
-            for speech in raw_speech:
+            for index, speech in enumerate(raw_speech):
                 if len(speech.shape) > 1:
                     logger.warning(
                         f"Only mono-channel audio is supported for input to {self.__class__.__name__}. "
                         "We will take the mean of the channels to convert to mono."
                     )
-                    speech = speech.mean(-1)
+                    raw_speech[index] = speech.mean(0)
 
         if is_batched_torch or is_batched_sequence:
             raw_speech = [speech[:, None].to(torch.float32) for speech in raw_speech]
diff --git a/src/transformers/models/parakeet/feature_extraction_parakeet.py b/src/transformers/models/parakeet/feature_extraction_parakeet.py
index c745d02c9629..95289cc00d99 100644
--- a/src/transformers/models/parakeet/feature_extraction_parakeet.py
+++ b/src/transformers/models/parakeet/feature_extraction_parakeet.py
@@ -217,17 +217,17 @@ def __call__(
                 f"Only mono-channel audio is supported for input to {self.__class__.__name__}. "
                 "We will take the mean of the channels to convert to mono."
             )
-            raw_speech = raw_speech.mean(-1)
+            raw_speech = raw_speech.mean(1)
 
         is_batched_sequence = isinstance(raw_speech, (list, tuple))
         if is_batched_sequence:
-            for speech in raw_speech:
+            for index, speech in enumerate(raw_speech):
                 if len(speech.shape) > 1:
                     logger.warning(
                         f"Only mono-channel audio is supported for input to {self.__class__.__name__}. "
                         "We will take the mean of the channels to convert to mono."
                     )
-                    speech = speech.mean(-1)
+                    raw_speech[index] = speech.mean(0)
 
         if is_batched_torch or is_batched_sequence:
             raw_speech = [speech[:, None].to(torch.float32) for speech in raw_speech]
diff --git a/src/transformers/models/phi4_multimodal/feature_extraction_phi4_multimodal.py b/src/transformers/models/phi4_multimodal/feature_extraction_phi4_multimodal.py
index 9ce98251e50e..3c3c1723a35a 100644
--- a/src/transformers/models/phi4_multimodal/feature_extraction_phi4_multimodal.py
+++ b/src/transformers/models/phi4_multimodal/feature_extraction_phi4_multimodal.py
@@ -145,17 +145,17 @@ def __call__(
                 f"Only mono-channel audio is supported for input to {self.__class__.__name__}. "
                 "We will take the mean of the channels to convert to mono."
             )
-            raw_speech = raw_speech.mean(-1)
+            raw_speech = raw_speech.mean(1)
 
         is_batched_sequence = isinstance(raw_speech, (list, tuple))
         if is_batched_sequence:
-            for speech in raw_speech:
+            for index, speech in enumerate(raw_speech):
                 if len(speech.shape) > 1:
                     logger.warning(
                         f"Only mono-channel audio is supported for input to {self.__class__.__name__}. "
                         "We will take the mean of the channels to convert to mono."
                     )
-                    speech = speech.mean(-1)
+                    raw_speech[index] = speech.mean(0)
 
         if is_batched_torch or is_batched_sequence:
             raw_speech = [speech[:, None].to(torch.float32) for speech in raw_speech]
diff --git a/src/transformers/models/voxtral_realtime/feature_extraction_voxtral_realtime.py b/src/transformers/models/voxtral_realtime/feature_extraction_voxtral_realtime.py
index 58355f3c0d7c..f13006f6b198 100644
--- a/src/transformers/models/voxtral_realtime/feature_extraction_voxtral_realtime.py
+++ b/src/transformers/models/voxtral_realtime/feature_extraction_voxtral_realtime.py
@@ -203,17 +203,17 @@ def __call__(
                 f"Only mono-channel audio is supported for input to {self.__class__.__name__}. "
                 "We will take the mean of the channels to convert to mono."
             )
-            raw_speech = raw_speech.mean(-1)
+            raw_speech = raw_speech.mean(1)
 
         is_batched_sequence = isinstance(raw_speech, (list, tuple))
         if is_batched_sequence:
-            for speech in raw_speech:
+            for index, speech in enumerate(raw_speech):
                 if len(speech.shape) > 1:
                     logger.warning(
                         f"Only mono-channel audio is supported for input to {self.__class__.__name__}. "
                         "We will take the mean of the channels to convert to mono."
                     )
-                    speech = speech.mean(-1)
+                    raw_speech[index] = speech.mean(0)
 
         if is_batched_torch or is_batched_sequence:
             raw_speech = [speech[:, None].to(torch.float32) for speech in raw_speech]