From e20a71fb01db04a039901997a6f0f9acef8d307b Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Wed, 15 Apr 2026 13:19:50 +0200 Subject: [PATCH 1/8] Make Qwen2_5OmniProcessor warning a lot less noisy via warning_once --- .../models/qwen2_5_omni/processing_qwen2_5_omni.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py index dcc98856ddc2..c37b64e4e040 100644 --- a/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py +++ b/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py @@ -16,7 +16,6 @@ Processor class for Qwen2.5Omni. """ -import logging import re import numpy as np @@ -25,10 +24,13 @@ from ...image_utils import ImageInput from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack, VideosKwargs from ...tokenization_utils_base import AudioInput, PreTokenizedInput, TextInput -from ...utils import auto_docstring +from ...utils import auto_docstring, logging from ...video_utils import VideoInput +logger = logging.get_logger(__name__) + + # Redefine kwargs for videos because Qwen-Omni uses some kwargs for processing omni # and does not use them in video processor class class Qwen2_5_OmniVideosKwargs(VideosKwargs, total=False): @@ -313,7 +315,7 @@ def apply_chat_template(self, conversations, chat_template=None, **kwargs): or conversation[0]["content"][0]["text"] != "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech." ): - logging.warning( + logger.warning_once( "System prompt modified, audio output may not work as expected. " + "Audio output mode only works when using default system prompt 'You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.'" ) From 4e0de1b623b870d6b079f99ea487aac68e3b85cb Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Wed, 15 Apr 2026 13:25:25 +0200 Subject: [PATCH 2/8] Bonus request: allow fully disabling the warning via config --- .../qwen2_5_omni/processing_qwen2_5_omni.py | 30 ++++++++++++------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py index c37b64e4e040..a08d7cf64fbc 100644 --- a/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py +++ b/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py @@ -107,7 +107,13 @@ class Qwen2_5OmniProcessorKwargs(ProcessingKwargs, total=False): @auto_docstring class Qwen2_5OmniProcessor(ProcessorMixin): def __init__( - self, image_processor=None, video_processor=None, feature_extractor=None, tokenizer=None, chat_template=None + self, + image_processor=None, + video_processor=None, + feature_extractor=None, + tokenizer=None, + chat_template=None, + check_audio_system_prompt: bool = True, ): super().__init__(image_processor, video_processor, feature_extractor, tokenizer, chat_template=chat_template) self.image_token = self.tokenizer.image_token @@ -117,6 +123,7 @@ def __init__( self.vision_eos_token = self.tokenizer.vision_eos_token self.audio_bos_token = self.tokenizer.audio_bos_token self.audio_eos_token = self.tokenizer.audio_eos_token + self.check_audio_system_prompt = check_audio_system_prompt @auto_docstring def __call__( @@ -309,16 +316,17 @@ def apply_chat_template(self, conversations, chat_template=None, **kwargs): conversations = [conversations] is_batched = True - for conversation in conversations: - if ( - conversation[0]["role"] != "system" - or conversation[0]["content"][0]["text"] - != "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech." - ): - logger.warning_once( - "System prompt modified, audio output may not work as expected. " - + "Audio output mode only works when using default system prompt 'You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.'" - ) + if self.check_audio_system_prompt: + for conversation in conversations: + if ( + conversation[0]["role"] != "system" + or conversation[0]["content"][0]["text"] + != "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech." + ): + logger.warning_once( + "System prompt modified, audio output may not work as expected. " + + "Audio output mode only works when using default system prompt 'You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.'" + ) if is_batched: conversations = conversations[0] From 2ad8a4e3aecb424703288e0067ac353cd7cea463 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Wed, 15 Apr 2026 13:37:41 +0200 Subject: [PATCH 3/8] Add test to show that warning can be quietened with this --- .../test_processing_qwen2_5_omni.py | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/tests/models/qwen2_5_omni/test_processing_qwen2_5_omni.py b/tests/models/qwen2_5_omni/test_processing_qwen2_5_omni.py index ce83b13a3f19..f3e6c59d4957 100644 --- a/tests/models/qwen2_5_omni/test_processing_qwen2_5_omni.py +++ b/tests/models/qwen2_5_omni/test_processing_qwen2_5_omni.py @@ -13,6 +13,9 @@ # See the License for the specific language governing permissions and # limitations under the License. import inspect +import json +import os +import tempfile import unittest import numpy as np @@ -353,3 +356,29 @@ def test_chat_template_audio_from_video(self): # Qwen pixel values are flattened, verify length matches video_grid_thw expected_video_tokens = sum(thw[0] * thw[1] * thw[2] for thw in out_dict["video_grid_thw"]) self.assertEqual(len(out_dict[self.videos_input_name]), expected_video_tokens) # 1 video in the conversation + + def test_check_audio_system_prompt_round_trip(self): + processor = self.get_processor() + self.assertTrue(processor.check_audio_system_prompt) + + processor.check_audio_system_prompt = False + with tempfile.TemporaryDirectory() as tmpdir: + processor.save_pretrained(tmpdir) + + with open(os.path.join(tmpdir, "processor_config.json")) as f: + saved_config = json.load(f) + self.assertFalse(saved_config["check_audio_system_prompt"]) + + reloaded = self.processor_class.from_pretrained(tmpdir) + self.assertFalse(reloaded.check_audio_system_prompt) + + # With the flag disabled, a non-default system prompt must not produce a warning. + messages = [{"role": "user", "content": [{"type": "text", "text": "hi"}]}] + with self.assertNoLogs("transformers.models.qwen2_5_omni.processing_qwen2_5_omni", level="WARNING"): + reloaded.apply_chat_template(messages, tokenize=False) + + # While with the flag enabled, we'll get a warning + reloaded.check_audio_system_prompt = True + messages = [{"role": "user", "content": [{"type": "text", "text": "hi"}]}] + with self.assertLogs("transformers.models.qwen2_5_omni.processing_qwen2_5_omni", level="WARNING"): + reloaded.apply_chat_template(messages, tokenize=False) From e3f8ea939cf9de9144d436b5025af3617dd6e7af Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Wed, 15 Apr 2026 13:53:12 +0200 Subject: [PATCH 4/8] Revert "Make Qwen2_5OmniProcessor warning a lot less noisy via warning_once" This reverts commit e20a71fb01db04a039901997a6f0f9acef8d307b to 2ad8a4e3aecb424703288e0067ac353cd7cea463 --- .../qwen2_5_omni/processing_qwen2_5_omni.py | 36 +++++++------------ .../test_processing_qwen2_5_omni.py | 29 --------------- 2 files changed, 13 insertions(+), 52 deletions(-) diff --git a/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py index a08d7cf64fbc..dcc98856ddc2 100644 --- a/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py +++ b/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py @@ -16,6 +16,7 @@ Processor class for Qwen2.5Omni. """ +import logging import re import numpy as np @@ -24,13 +25,10 @@ from ...image_utils import ImageInput from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack, VideosKwargs from ...tokenization_utils_base import AudioInput, PreTokenizedInput, TextInput -from ...utils import auto_docstring, logging +from ...utils import auto_docstring from ...video_utils import VideoInput -logger = logging.get_logger(__name__) - - # Redefine kwargs for videos because Qwen-Omni uses some kwargs for processing omni # and does not use them in video processor class class Qwen2_5_OmniVideosKwargs(VideosKwargs, total=False): @@ -107,13 +105,7 @@ class Qwen2_5OmniProcessorKwargs(ProcessingKwargs, total=False): @auto_docstring class Qwen2_5OmniProcessor(ProcessorMixin): def __init__( - self, - image_processor=None, - video_processor=None, - feature_extractor=None, - tokenizer=None, - chat_template=None, - check_audio_system_prompt: bool = True, + self, image_processor=None, video_processor=None, feature_extractor=None, tokenizer=None, chat_template=None ): super().__init__(image_processor, video_processor, feature_extractor, tokenizer, chat_template=chat_template) self.image_token = self.tokenizer.image_token @@ -123,7 +115,6 @@ def __init__( self.vision_eos_token = self.tokenizer.vision_eos_token self.audio_bos_token = self.tokenizer.audio_bos_token self.audio_eos_token = self.tokenizer.audio_eos_token - self.check_audio_system_prompt = check_audio_system_prompt @auto_docstring def __call__( @@ -316,17 +307,16 @@ def apply_chat_template(self, conversations, chat_template=None, **kwargs): conversations = [conversations] is_batched = True - if self.check_audio_system_prompt: - for conversation in conversations: - if ( - conversation[0]["role"] != "system" - or conversation[0]["content"][0]["text"] - != "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech." - ): - logger.warning_once( - "System prompt modified, audio output may not work as expected. " - + "Audio output mode only works when using default system prompt 'You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.'" - ) + for conversation in conversations: + if ( + conversation[0]["role"] != "system" + or conversation[0]["content"][0]["text"] + != "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech." + ): + logging.warning( + "System prompt modified, audio output may not work as expected. " + + "Audio output mode only works when using default system prompt 'You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.'" + ) if is_batched: conversations = conversations[0] diff --git a/tests/models/qwen2_5_omni/test_processing_qwen2_5_omni.py b/tests/models/qwen2_5_omni/test_processing_qwen2_5_omni.py index f3e6c59d4957..ce83b13a3f19 100644 --- a/tests/models/qwen2_5_omni/test_processing_qwen2_5_omni.py +++ b/tests/models/qwen2_5_omni/test_processing_qwen2_5_omni.py @@ -13,9 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. import inspect -import json -import os -import tempfile import unittest import numpy as np @@ -356,29 +353,3 @@ def test_chat_template_audio_from_video(self): # Qwen pixel values are flattened, verify length matches video_grid_thw expected_video_tokens = sum(thw[0] * thw[1] * thw[2] for thw in out_dict["video_grid_thw"]) self.assertEqual(len(out_dict[self.videos_input_name]), expected_video_tokens) # 1 video in the conversation - - def test_check_audio_system_prompt_round_trip(self): - processor = self.get_processor() - self.assertTrue(processor.check_audio_system_prompt) - - processor.check_audio_system_prompt = False - with tempfile.TemporaryDirectory() as tmpdir: - processor.save_pretrained(tmpdir) - - with open(os.path.join(tmpdir, "processor_config.json")) as f: - saved_config = json.load(f) - self.assertFalse(saved_config["check_audio_system_prompt"]) - - reloaded = self.processor_class.from_pretrained(tmpdir) - self.assertFalse(reloaded.check_audio_system_prompt) - - # With the flag disabled, a non-default system prompt must not produce a warning. - messages = [{"role": "user", "content": [{"type": "text", "text": "hi"}]}] - with self.assertNoLogs("transformers.models.qwen2_5_omni.processing_qwen2_5_omni", level="WARNING"): - reloaded.apply_chat_template(messages, tokenize=False) - - # While with the flag enabled, we'll get a warning - reloaded.check_audio_system_prompt = True - messages = [{"role": "user", "content": [{"type": "text", "text": "hi"}]}] - with self.assertLogs("transformers.models.qwen2_5_omni.processing_qwen2_5_omni", level="WARNING"): - reloaded.apply_chat_template(messages, tokenize=False) From 1f77f89612dd8e2f1554840214ff0cabcc111d79 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Wed, 15 Apr 2026 13:53:31 +0200 Subject: [PATCH 5/8] Remove system prompt warning --- .../models/qwen2_5_omni/processing_qwen2_5_omni.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py index dcc98856ddc2..927ea350b5ca 100644 --- a/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py +++ b/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py @@ -306,17 +306,6 @@ def apply_chat_template(self, conversations, chat_template=None, **kwargs): if isinstance(conversations[0], dict): conversations = [conversations] is_batched = True - - for conversation in conversations: - if ( - conversation[0]["role"] != "system" - or conversation[0]["content"][0]["text"] - != "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech." - ): - logging.warning( - "System prompt modified, audio output may not work as expected. " - + "Audio output mode only works when using default system prompt 'You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.'" - ) if is_batched: conversations = conversations[0] From 9e95eedf10061ff4b032cfe6e631f79d1b820afb Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Wed, 15 Apr 2026 13:55:20 +0200 Subject: [PATCH 6/8] Make style --- src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py index 927ea350b5ca..0c1cf90c7849 100644 --- a/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py +++ b/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py @@ -16,7 +16,6 @@ Processor class for Qwen2.5Omni. """ -import logging import re import numpy as np From e0d7086c06011ad448407d2ae0dba6fa6e8028be Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Wed, 15 Apr 2026 14:16:04 +0200 Subject: [PATCH 7/8] Fully remove apply_chat_template! --- .../models/qwen2_5_omni/processing_qwen2_5_omni.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py index 0c1cf90c7849..5f5b6584862a 100644 --- a/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py +++ b/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py @@ -300,16 +300,6 @@ def _iter(): return list(_iter()) - def apply_chat_template(self, conversations, chat_template=None, **kwargs): - is_batched = False - if isinstance(conversations[0], dict): - conversations = [conversations] - is_batched = True - if is_batched: - conversations = conversations[0] - - return super().apply_chat_template(conversations, chat_template, **kwargs) - def post_process_image_text_to_text(self, generated_outputs, skip_special_tokens=True, **kwargs): """ Post-process the output of a vlm to decode the text. From abc00312f80f8ce9ccbd5966eff96281e24aff95 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Wed, 15 Apr 2026 14:21:49 +0200 Subject: [PATCH 8/8] Rerun modular_model_converter.py for qwen3_omni_moe --- .../models/qwen3_omni_moe/processing_qwen3_omni_moe.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/qwen3_omni_moe/processing_qwen3_omni_moe.py b/src/transformers/models/qwen3_omni_moe/processing_qwen3_omni_moe.py index 9ab134377829..f8fa23ee31ba 100644 --- a/src/transformers/models/qwen3_omni_moe/processing_qwen3_omni_moe.py +++ b/src/transformers/models/qwen3_omni_moe/processing_qwen3_omni_moe.py @@ -318,9 +318,6 @@ def _iter(): return list(_iter()) - def apply_chat_template(self, conversations, chat_template=None, **kwargs): - return super().apply_chat_template(conversations, chat_template, **kwargs) - def post_process_image_text_to_text(self, generated_outputs, skip_special_tokens=True, **kwargs): """ Post-process the output of a vlm to decode the text. @@ -392,5 +389,8 @@ def model_input_names(self): ) ) + def apply_chat_template(self, conversations, chat_template=None, **kwargs): + return super().apply_chat_template(conversations, chat_template, **kwargs) + __all__ = ["Qwen3OmniMoeProcessor"]