From e20a71fb01db04a039901997a6f0f9acef8d307b Mon Sep 17 00:00:00 2001
From: Tom Aarsen <Cubiegamedev@gmail.com>
Date: Wed, 15 Apr 2026 13:19:50 +0200
Subject: [PATCH 1/8] Make Qwen2_5OmniProcessor warning a lot less noisy via
 warning_once

---
 .../models/qwen2_5_omni/processing_qwen2_5_omni.py        | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py
index dcc98856ddc2..c37b64e4e040 100644
--- a/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py
+++ b/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py
@@ -16,7 +16,6 @@
 Processor class for Qwen2.5Omni.
 """
 
-import logging
 import re
 
 import numpy as np
@@ -25,10 +24,13 @@
 from ...image_utils import ImageInput
 from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack, VideosKwargs
 from ...tokenization_utils_base import AudioInput, PreTokenizedInput, TextInput
-from ...utils import auto_docstring
+from ...utils import auto_docstring, logging
 from ...video_utils import VideoInput
 
 
+logger = logging.get_logger(__name__)
+
+
 # Redefine kwargs for videos because Qwen-Omni uses some kwargs for processing omni
 # and does not use them in video processor class
 class Qwen2_5_OmniVideosKwargs(VideosKwargs, total=False):
@@ -313,7 +315,7 @@ def apply_chat_template(self, conversations, chat_template=None, **kwargs):
                 or conversation[0]["content"][0]["text"]
                 != "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech."
             ):
-                logging.warning(
+                logger.warning_once(
                     "System prompt modified, audio output may not work as expected. "
                     + "Audio output mode only works when using default system prompt 'You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.'"
                 )

From 4e0de1b623b870d6b079f99ea487aac68e3b85cb Mon Sep 17 00:00:00 2001
From: Tom Aarsen <Cubiegamedev@gmail.com>
Date: Wed, 15 Apr 2026 13:25:25 +0200
Subject: [PATCH 2/8] Bonus request: allow fully disabling the warning via
 config

---
 .../qwen2_5_omni/processing_qwen2_5_omni.py   | 30 ++++++++++++-------
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py
index c37b64e4e040..a08d7cf64fbc 100644
--- a/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py
+++ b/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py
@@ -107,7 +107,13 @@ class Qwen2_5OmniProcessorKwargs(ProcessingKwargs, total=False):
 @auto_docstring
 class Qwen2_5OmniProcessor(ProcessorMixin):
     def __init__(
-        self, image_processor=None, video_processor=None, feature_extractor=None, tokenizer=None, chat_template=None
+        self,
+        image_processor=None,
+        video_processor=None,
+        feature_extractor=None,
+        tokenizer=None,
+        chat_template=None,
+        check_audio_system_prompt: bool = True,
     ):
         super().__init__(image_processor, video_processor, feature_extractor, tokenizer, chat_template=chat_template)
         self.image_token = self.tokenizer.image_token
@@ -117,6 +123,7 @@ def __init__(
         self.vision_eos_token = self.tokenizer.vision_eos_token
         self.audio_bos_token = self.tokenizer.audio_bos_token
         self.audio_eos_token = self.tokenizer.audio_eos_token
+        self.check_audio_system_prompt = check_audio_system_prompt
 
     @auto_docstring
     def __call__(
@@ -309,16 +316,17 @@ def apply_chat_template(self, conversations, chat_template=None, **kwargs):
             conversations = [conversations]
             is_batched = True
 
-        for conversation in conversations:
-            if (
-                conversation[0]["role"] != "system"
-                or conversation[0]["content"][0]["text"]
-                != "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech."
-            ):
-                logger.warning_once(
-                    "System prompt modified, audio output may not work as expected. "
-                    + "Audio output mode only works when using default system prompt 'You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.'"
-                )
+        if self.check_audio_system_prompt:
+            for conversation in conversations:
+                if (
+                    conversation[0]["role"] != "system"
+                    or conversation[0]["content"][0]["text"]
+                    != "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech."
+                ):
+                    logger.warning_once(
+                        "System prompt modified, audio output may not work as expected. "
+                        + "Audio output mode only works when using default system prompt 'You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.'"
+                    )
         if is_batched:
             conversations = conversations[0]
 

From 2ad8a4e3aecb424703288e0067ac353cd7cea463 Mon Sep 17 00:00:00 2001
From: Tom Aarsen <Cubiegamedev@gmail.com>
Date: Wed, 15 Apr 2026 13:37:41 +0200
Subject: [PATCH 3/8] Add test showing check_audio_system_prompt silences the warning

---
 .../test_processing_qwen2_5_omni.py           | 29 +++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/tests/models/qwen2_5_omni/test_processing_qwen2_5_omni.py b/tests/models/qwen2_5_omni/test_processing_qwen2_5_omni.py
index ce83b13a3f19..f3e6c59d4957 100644
--- a/tests/models/qwen2_5_omni/test_processing_qwen2_5_omni.py
+++ b/tests/models/qwen2_5_omni/test_processing_qwen2_5_omni.py
@@ -13,6 +13,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import inspect
+import json
+import os
+import tempfile
 import unittest
 
 import numpy as np
@@ -353,3 +356,29 @@ def test_chat_template_audio_from_video(self):
         # Qwen pixel values are flattened, verify length matches video_grid_thw
         expected_video_tokens = sum(thw[0] * thw[1] * thw[2] for thw in out_dict["video_grid_thw"])
         self.assertEqual(len(out_dict[self.videos_input_name]), expected_video_tokens)  # 1 video in the conversation
+
+    def test_check_audio_system_prompt_round_trip(self):
+        processor = self.get_processor()
+        self.assertTrue(processor.check_audio_system_prompt)
+
+        processor.check_audio_system_prompt = False
+        with tempfile.TemporaryDirectory() as tmpdir:
+            processor.save_pretrained(tmpdir)
+
+            with open(os.path.join(tmpdir, "processor_config.json")) as f:
+                saved_config = json.load(f)
+            self.assertFalse(saved_config["check_audio_system_prompt"])
+
+            reloaded = self.processor_class.from_pretrained(tmpdir)
+        self.assertFalse(reloaded.check_audio_system_prompt)
+
+        # With the flag disabled, a non-default system prompt must not produce a warning.
+        messages = [{"role": "user", "content": [{"type": "text", "text": "hi"}]}]
+        with self.assertNoLogs("transformers.models.qwen2_5_omni.processing_qwen2_5_omni", level="WARNING"):
+            reloaded.apply_chat_template(messages, tokenize=False)
+
+        # While with the flag enabled, we'll get a warning
+        reloaded.check_audio_system_prompt = True
+        messages = [{"role": "user", "content": [{"type": "text", "text": "hi"}]}]
+        with self.assertLogs("transformers.models.qwen2_5_omni.processing_qwen2_5_omni", level="WARNING"):
+            reloaded.apply_chat_template(messages, tokenize=False)

From e3f8ea939cf9de9144d436b5025af3617dd6e7af Mon Sep 17 00:00:00 2001
From: Tom Aarsen <Cubiegamedev@gmail.com>
Date: Wed, 15 Apr 2026 13:53:12 +0200
Subject: [PATCH 4/8] Revert "Make Qwen2_5OmniProcessor warning a lot less
 noisy via warning_once"

This reverts commits e20a71fb01db04a039901997a6f0f9acef8d307b through
2ad8a4e3aecb424703288e0067ac353cd7cea463 (patches 1-3 of this series).
---
 .../qwen2_5_omni/processing_qwen2_5_omni.py   | 36 +++++++------------
 .../test_processing_qwen2_5_omni.py           | 29 ---------------
 2 files changed, 13 insertions(+), 52 deletions(-)

diff --git a/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py
index a08d7cf64fbc..dcc98856ddc2 100644
--- a/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py
+++ b/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py
@@ -16,6 +16,7 @@
 Processor class for Qwen2.5Omni.
 """
 
+import logging
 import re
 
 import numpy as np
@@ -24,13 +25,10 @@
 from ...image_utils import ImageInput
 from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack, VideosKwargs
 from ...tokenization_utils_base import AudioInput, PreTokenizedInput, TextInput
-from ...utils import auto_docstring, logging
+from ...utils import auto_docstring
 from ...video_utils import VideoInput
 
 
-logger = logging.get_logger(__name__)
-
-
 # Redefine kwargs for videos because Qwen-Omni uses some kwargs for processing omni
 # and does not use them in video processor class
 class Qwen2_5_OmniVideosKwargs(VideosKwargs, total=False):
@@ -107,13 +105,7 @@ class Qwen2_5OmniProcessorKwargs(ProcessingKwargs, total=False):
 @auto_docstring
 class Qwen2_5OmniProcessor(ProcessorMixin):
     def __init__(
-        self,
-        image_processor=None,
-        video_processor=None,
-        feature_extractor=None,
-        tokenizer=None,
-        chat_template=None,
-        check_audio_system_prompt: bool = True,
+        self, image_processor=None, video_processor=None, feature_extractor=None, tokenizer=None, chat_template=None
     ):
         super().__init__(image_processor, video_processor, feature_extractor, tokenizer, chat_template=chat_template)
         self.image_token = self.tokenizer.image_token
@@ -123,7 +115,6 @@ def __init__(
         self.vision_eos_token = self.tokenizer.vision_eos_token
         self.audio_bos_token = self.tokenizer.audio_bos_token
         self.audio_eos_token = self.tokenizer.audio_eos_token
-        self.check_audio_system_prompt = check_audio_system_prompt
 
     @auto_docstring
     def __call__(
@@ -316,17 +307,16 @@ def apply_chat_template(self, conversations, chat_template=None, **kwargs):
             conversations = [conversations]
             is_batched = True
 
-        if self.check_audio_system_prompt:
-            for conversation in conversations:
-                if (
-                    conversation[0]["role"] != "system"
-                    or conversation[0]["content"][0]["text"]
-                    != "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech."
-                ):
-                    logger.warning_once(
-                        "System prompt modified, audio output may not work as expected. "
-                        + "Audio output mode only works when using default system prompt 'You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.'"
-                    )
+        for conversation in conversations:
+            if (
+                conversation[0]["role"] != "system"
+                or conversation[0]["content"][0]["text"]
+                != "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech."
+            ):
+                logging.warning(
+                    "System prompt modified, audio output may not work as expected. "
+                    + "Audio output mode only works when using default system prompt 'You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.'"
+                )
         if is_batched:
             conversations = conversations[0]
 
diff --git a/tests/models/qwen2_5_omni/test_processing_qwen2_5_omni.py b/tests/models/qwen2_5_omni/test_processing_qwen2_5_omni.py
index f3e6c59d4957..ce83b13a3f19 100644
--- a/tests/models/qwen2_5_omni/test_processing_qwen2_5_omni.py
+++ b/tests/models/qwen2_5_omni/test_processing_qwen2_5_omni.py
@@ -13,9 +13,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import inspect
-import json
-import os
-import tempfile
 import unittest
 
 import numpy as np
@@ -356,29 +353,3 @@ def test_chat_template_audio_from_video(self):
         # Qwen pixel values are flattened, verify length matches video_grid_thw
         expected_video_tokens = sum(thw[0] * thw[1] * thw[2] for thw in out_dict["video_grid_thw"])
         self.assertEqual(len(out_dict[self.videos_input_name]), expected_video_tokens)  # 1 video in the conversation
-
-    def test_check_audio_system_prompt_round_trip(self):
-        processor = self.get_processor()
-        self.assertTrue(processor.check_audio_system_prompt)
-
-        processor.check_audio_system_prompt = False
-        with tempfile.TemporaryDirectory() as tmpdir:
-            processor.save_pretrained(tmpdir)
-
-            with open(os.path.join(tmpdir, "processor_config.json")) as f:
-                saved_config = json.load(f)
-            self.assertFalse(saved_config["check_audio_system_prompt"])
-
-            reloaded = self.processor_class.from_pretrained(tmpdir)
-        self.assertFalse(reloaded.check_audio_system_prompt)
-
-        # With the flag disabled, a non-default system prompt must not produce a warning.
-        messages = [{"role": "user", "content": [{"type": "text", "text": "hi"}]}]
-        with self.assertNoLogs("transformers.models.qwen2_5_omni.processing_qwen2_5_omni", level="WARNING"):
-            reloaded.apply_chat_template(messages, tokenize=False)
-
-        # While with the flag enabled, we'll get a warning
-        reloaded.check_audio_system_prompt = True
-        messages = [{"role": "user", "content": [{"type": "text", "text": "hi"}]}]
-        with self.assertLogs("transformers.models.qwen2_5_omni.processing_qwen2_5_omni", level="WARNING"):
-            reloaded.apply_chat_template(messages, tokenize=False)

From 1f77f89612dd8e2f1554840214ff0cabcc111d79 Mon Sep 17 00:00:00 2001
From: Tom Aarsen <Cubiegamedev@gmail.com>
Date: Wed, 15 Apr 2026 13:53:31 +0200
Subject: [PATCH 5/8] Remove system prompt warning

---
 .../models/qwen2_5_omni/processing_qwen2_5_omni.py    | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py
index dcc98856ddc2..927ea350b5ca 100644
--- a/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py
+++ b/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py
@@ -306,17 +306,6 @@ def apply_chat_template(self, conversations, chat_template=None, **kwargs):
         if isinstance(conversations[0], dict):
             conversations = [conversations]
             is_batched = True
-
-        for conversation in conversations:
-            if (
-                conversation[0]["role"] != "system"
-                or conversation[0]["content"][0]["text"]
-                != "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech."
-            ):
-                logging.warning(
-                    "System prompt modified, audio output may not work as expected. "
-                    + "Audio output mode only works when using default system prompt 'You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.'"
-                )
         if is_batched:
             conversations = conversations[0]
 

From 9e95eedf10061ff4b032cfe6e631f79d1b820afb Mon Sep 17 00:00:00 2001
From: Tom Aarsen <Cubiegamedev@gmail.com>
Date: Wed, 15 Apr 2026 13:55:20 +0200
Subject: [PATCH 6/8] Make style

---
 src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py
index 927ea350b5ca..0c1cf90c7849 100644
--- a/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py
+++ b/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py
@@ -16,7 +16,6 @@
 Processor class for Qwen2.5Omni.
 """
 
-import logging
 import re
 
 import numpy as np

From e0d7086c06011ad448407d2ae0dba6fa6e8028be Mon Sep 17 00:00:00 2001
From: Tom Aarsen <Cubiegamedev@gmail.com>
Date: Wed, 15 Apr 2026 14:16:04 +0200
Subject: [PATCH 7/8] Fully remove apply_chat_template!

---
 .../models/qwen2_5_omni/processing_qwen2_5_omni.py     | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py
index 0c1cf90c7849..5f5b6584862a 100644
--- a/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py
+++ b/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py
@@ -300,16 +300,6 @@ def _iter():
 
         return list(_iter())
 
-    def apply_chat_template(self, conversations, chat_template=None, **kwargs):
-        is_batched = False
-        if isinstance(conversations[0], dict):
-            conversations = [conversations]
-            is_batched = True
-        if is_batched:
-            conversations = conversations[0]
-
-        return super().apply_chat_template(conversations, chat_template, **kwargs)
-
     def post_process_image_text_to_text(self, generated_outputs, skip_special_tokens=True, **kwargs):
         """
         Post-process the output of a vlm to decode the text.

From abc00312f80f8ce9ccbd5966eff96281e24aff95 Mon Sep 17 00:00:00 2001
From: Tom Aarsen <Cubiegamedev@gmail.com>
Date: Wed, 15 Apr 2026 14:21:49 +0200
Subject: [PATCH 8/8] Rerun modular_model_converter.py for qwen3_omni_moe

---
 .../models/qwen3_omni_moe/processing_qwen3_omni_moe.py      | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/transformers/models/qwen3_omni_moe/processing_qwen3_omni_moe.py b/src/transformers/models/qwen3_omni_moe/processing_qwen3_omni_moe.py
index 9ab134377829..f8fa23ee31ba 100644
--- a/src/transformers/models/qwen3_omni_moe/processing_qwen3_omni_moe.py
+++ b/src/transformers/models/qwen3_omni_moe/processing_qwen3_omni_moe.py
@@ -318,9 +318,6 @@ def _iter():
 
         return list(_iter())
 
-    def apply_chat_template(self, conversations, chat_template=None, **kwargs):
-        return super().apply_chat_template(conversations, chat_template, **kwargs)
-
     def post_process_image_text_to_text(self, generated_outputs, skip_special_tokens=True, **kwargs):
         """
         Post-process the output of a vlm to decode the text.
@@ -392,5 +389,8 @@ def model_input_names(self):
             )
         )
 
+    def apply_chat_template(self, conversations, chat_template=None, **kwargs):
+        return super().apply_chat_template(conversations, chat_template, **kwargs)
+
 
 __all__ = ["Qwen3OmniMoeProcessor"]