From 112ed9c00d73e3d0918f1c009e0b5b638117d72d Mon Sep 17 00:00:00 2001
From: Richard Liu <2645345468@qq.com>
Date: Mon, 23 Mar 2026 00:22:49 +0800
Subject: [PATCH] fix: align mimo tts style payload with official docs

---
 astrbot/core/config/default.py                |  6 +-
 .../provider/sources/mimo_tts_api_source.py   | 56 ++++++++----
 .../en-US/features/config-metadata.json       |  6 +-
 .../ru-RU/features/config-metadata.json       |  6 +-
 .../zh-CN/features/config-metadata.json       |  6 +-
 tests/test_mimo_api_sources.py                | 90 ++++++++++++++++---
 6 files changed, 129 insertions(+), 41 deletions(-)
diff --git a/astrbot/core/config/default.py b/astrbot/core/config/default.py
index 20d9e06134..0f43dbd06d 100644
--- a/astrbot/core/config/default.py
+++ b/astrbot/core/config/default.py
@@ -2442,17 +2442,17 @@ class ChatProviderTemplate(TypedDict):
                     "mimo-tts-style-prompt": {
                         "description": "风格提示词",
                         "type": "string",
-                        "hint": "用于控制生成语音的说话风格、语气或情绪，例如温柔、活泼、沉稳等。可留空。",
+                        "hint": "会以 <style>...</style> 标签形式添加到待合成文本开头，用于控制语速、情绪、角色或风格，例如 开心、变快、孙悟空、悄悄话。可留空。",
                     },
                     "mimo-tts-dialect": {
                         "description": "方言",
                         "type": "string",
-                        "hint": "指定生成语音时使用的方言或口音，例如四川话、粤语口音等。可留空。",
+                        "hint": "会与风格提示词一起写入开头的 <style>...</style> 标签中，例如 东北话、四川话、河南话、粤语。可留空。",
                     },
                     "mimo-tts-seed-text": {
                         "description": "种子文本",
                         "type": "string",
-                        "hint": "用于引导音色和说话方式的参考文本，会影响生成语音的表达风格。",
+                        "hint": "作为可选的 user 消息发送，用于辅助调节语气和风格，不会拼接到待合成文本中。",
                     },
                     "fishaudio-tts-character": {
                         "description": "character",
diff --git a/astrbot/core/provider/sources/mimo_tts_api_source.py b/astrbot/core/provider/sources/mimo_tts_api_source.py
index daad55e1e6..2966bfb7d8 100644
--- a/astrbot/core/provider/sources/mimo_tts_api_source.py
+++ b/astrbot/core/provider/sources/mimo_tts_api_source.py
@@ -44,35 +44,53 @@ def __init__(
         self.set_model(provider_config.get("model", DEFAULT_MIMO_TTS_MODEL))
         self.client = create_http_client(self.timeout, self.proxy)
 
-    def _build_user_prompt(self) -> str:
-        prompt_parts: list[str] = []
+    def _build_user_prompt(self) -> str | None:
+        seed_text = self.seed_text.strip()
+        return seed_text or None
+
+    def _build_style_prefix(self) -> str:
+        style_parts: list[str] = []
 
         if self.style_prompt.strip():
-            prompt_parts.append(self.style_prompt.strip())
+            style_parts.append(self.style_prompt.strip())
         if self.dialect.strip():
-            prompt_parts.append(f"Please use {self.dialect.strip()} when speaking.")
+            style_parts.append(self.dialect.strip())
+
+        style_content = " ".join(style_parts).strip()
+        if not style_content:
+            return ""
 
-        if not prompt_parts:
-            return self.seed_text
+        # MiMo recommends a lone leading singing tag, so other style words are dropped.
+        if "唱歌" in style_content:
+            return "<style>唱歌</style>"
 
-        if self.seed_text.strip():
-            prompt_parts.append(self.seed_text.strip())
+        return f"<style>{style_content}</style>"
 
-        return " ".join(prompt_parts)
+    def _build_assistant_content(self, text: str) -> str:
+        return f"{self._build_style_prefix()}{text}"
 
     def _build_payload(self, text: str) -> dict:
-        return {
-            "model": self.model_name,
-            "messages": [
+        messages: list[dict[str, str]] = []
+
+        user_prompt = self._build_user_prompt()
+        if user_prompt:
+            messages.append(
                 {
                     "role": "user",
-                    "content": self._build_user_prompt(),
-                },
-                {
-                    "role": "assistant",
-                    "content": text,
-                },
-            ],
+                    "content": user_prompt,
+                }
+            )
+
+        messages.append(
+            {
+                "role": "assistant",
+                "content": self._build_assistant_content(text),
+            }
+        )
+
+        return {
+            "model": self.model_name,
+            "messages": messages,
             "audio": {
                 "format": self.audio_format,
                 "voice": self.voice,
diff --git a/dashboard/src/i18n/locales/en-US/features/config-metadata.json b/dashboard/src/i18n/locales/en-US/features/config-metadata.json
index 5beba764d2..43aae5984b 100644
--- a/dashboard/src/i18n/locales/en-US/features/config-metadata.json
+++ b/dashboard/src/i18n/locales/en-US/features/config-metadata.json
@@ -1457,15 +1457,15 @@
       },
       "mimo-tts-style-prompt": {
         "description": "Style prompt",
-        "hint": "Guides speaking style, tone, or emotion such as gentle, lively, or calm. Optional."
+        "hint": "Prepended to the synthesis target text as a <style>...</style> tag to control speed, emotion, character, or style, such as happy, faster, Sun Wukong, or whispering. Optional."
       },
       "mimo-tts-dialect": {
         "description": "Dialect",
-        "hint": "Target dialect or accent for generated speech, such as Sichuan dialect. Optional."
+        "hint": "Combined with the style prompt inside the leading <style>...</style> tag, for example Northeastern Mandarin, Sichuan dialect, Henan dialect, or Cantonese. Optional."
       },
       "mimo-tts-seed-text": {
         "description": "Seed text",
-        "hint": "Reference text used to guide voice characteristics and speaking style."
+        "hint": "Sent as an optional user message to help guide tone and speaking style. It is not concatenated into the synthesis target text."
       },
       "fishaudio-tts-character": {
         "description": "character",
diff --git a/dashboard/src/i18n/locales/ru-RU/features/config-metadata.json b/dashboard/src/i18n/locales/ru-RU/features/config-metadata.json
index 3e96c8f256..06f60dd40a 100644
--- a/dashboard/src/i18n/locales/ru-RU/features/config-metadata.json
+++ b/dashboard/src/i18n/locales/ru-RU/features/config-metadata.json
@@ -1454,15 +1454,15 @@
             },
             "mimo-tts-style-prompt": {
                 "description": "Подсказка стиля",
-                "hint": "Задает стиль речи, тон или эмоцию, например мягкий, живой или спокойный. Необязательно."
+                "hint": "Добавляется в начало синтезируемого текста в виде тега <style>...</style> и управляет скоростью, эмоцией, ролью или манерой речи, например: радостно, быстрее, Сунь Укун или шёпотом. Необязательно."
             },
             "mimo-tts-dialect": {
                 "description": "Диалект",
-                "hint": "Диалект или акцент для синтезируемой речи, например сычуаньский диалект. Необязательно."
+                "hint": "Объединяется с подсказкой стиля внутри начального тега <style>...</style>, например северо-восточный, сычуаньский, хэнаньский или кантонский вариант речи. Необязательно."
             },
             "mimo-tts-seed-text": {
                 "description": "Начальный текст",
-                "hint": "Эталонный текст, который помогает задать особенности голоса и манеру речи."
+                "hint": "Отправляется как необязательное user-сообщение для настройки тона и манеры речи. Не добавляется к самому тексту синтеза."
             },
             "fishaudio-tts-character": {
                 "description": "Персонаж",
diff --git a/dashboard/src/i18n/locales/zh-CN/features/config-metadata.json b/dashboard/src/i18n/locales/zh-CN/features/config-metadata.json
index 9cbbd38de6..7b59a981d5 100644
--- a/dashboard/src/i18n/locales/zh-CN/features/config-metadata.json
+++ b/dashboard/src/i18n/locales/zh-CN/features/config-metadata.json
@@ -1459,15 +1459,15 @@
       },
       "mimo-tts-style-prompt": {
         "description": "风格提示词",
-        "hint": "用于控制生成语音的说话风格、语气或情绪，例如温柔、活泼、沉稳等。可留空。"
+        "hint": "会以 <style>...</style> 标签形式添加到待合成文本开头，用于控制语速、情绪、角色或风格，例如 开心、变快、孙悟空、悄悄话。可留空。"
       },
       "mimo-tts-dialect": {
         "description": "方言",
-        "hint": "指定生成语音时使用的方言或口音，例如四川话、粤语口音等。可留空。"
+        "hint": "会与风格提示词一起写入开头的 <style>...</style> 标签中，例如 东北话、四川话、河南话、粤语。可留空。"
       },
       "mimo-tts-seed-text": {
         "description": "种子文本",
-        "hint": "用于引导音色和说话方式的参考文本，会影响生成语音的表达风格。"
+        "hint": "作为可选的 user 消息发送，用于辅助调节语气和风格，不会拼接到待合成文本中。"
       },
       "fishaudio-tts-character": {
         "description": "character",
diff --git a/tests/test_mimo_api_sources.py b/tests/test_mimo_api_sources.py
index d262f1c275..c2b02aa136 100644
--- a/tests/test_mimo_api_sources.py
+++ b/tests/test_mimo_api_sources.py
@@ -35,7 +35,7 @@ def _make_stt_provider(overrides: dict | None = None) -> ProviderMiMoSTTAPI:
     return ProviderMiMoSTTAPI(provider_config=provider_config, provider_settings={})
 
 
-def test_mimo_tts_prompt_returns_seed_text_when_no_style_or_dialect():
+def test_mimo_tts_user_prompt_returns_seed_text():
     provider = _make_tts_provider()
     try:
         assert provider._build_user_prompt() == "seed text"
@@ -43,21 +43,88 @@ def test_mimo_tts_prompt_returns_seed_text_when_no_style_or_dialect():
         asyncio.run(provider.terminate())
 
 
-def test_mimo_tts_payload_includes_dialect_and_style_prompt():
+def test_mimo_tts_assistant_content_prefixes_style_and_dialect():
     provider = _make_tts_provider(
         {
-            "mimo-tts-style-prompt": "Please sound cheerful and lively.",
-            "mimo-tts-dialect": "Sichuan dialect",
+            "mimo-tts-style-prompt": "开心",
+            "mimo-tts-dialect": "四川话",
             "mimo-tts-seed-text": "You are chatting with a close friend.",
         }
     )
     try:
         payload = provider._build_payload("hello")
-        assert payload["messages"][0]["content"] == (
-            "Please sound cheerful and lively. "
-            "Please use Sichuan dialect when speaking. "
-            "You are chatting with a close friend."
-        )
+        assert payload["messages"][0] == {
+            "role": "user",
+            "content": "You are chatting with a close friend.",
+        }
+        assert payload["messages"][1]["content"] == "<style>开心 四川话</style>hello"
+    finally:
+        asyncio.run(provider.terminate())
+
+
+def test_mimo_tts_payload_omits_user_message_without_seed_text():
+    provider = _make_tts_provider(
+        {
+            "mimo-tts-seed-text": "",
+            "mimo-tts-style-prompt": "开心",
+        }
+    )
+    try:
+        payload = provider._build_payload("hello")
+        assert payload["messages"] == [
+            {
+                "role": "assistant",
+                "content": "<style>开心</style>hello",
+            }
+        ]
+    finally:
+        asyncio.run(provider.terminate())
+
+
+def test_mimo_tts_singing_style_uses_single_style_tag():
+    provider = _make_tts_provider(
+        {
+            "mimo-tts-style-prompt": "唱歌 开心",
+            "mimo-tts-dialect": "粤语",
+        }
+    )
+    try:
+        payload = provider._build_payload("歌词")
+        assert payload["messages"][1]["content"] == "<style>唱歌</style>歌词"
+    finally:
+        asyncio.run(provider.terminate())
+
+
+def test_mimo_tts_plain_text_stays_in_assistant_message_when_no_style():
+    provider = _make_tts_provider(
+        {
+            "mimo-tts-seed-text": "",
+        }
+    )
+    try:
+        payload = provider._build_payload("hello")
+        assert payload["messages"] == [
+            {
+                "role": "assistant",
+                "content": "hello",
+            }
+        ]
+    finally:
+        asyncio.run(provider.terminate())
+
+
+def test_mimo_tts_seed_text_is_not_prepended_to_assistant_content():
+    provider = _make_tts_provider(
+        {
+            "mimo-tts-style-prompt": "开心",
+            "mimo-tts-seed-text": "reference text",
+        }
+    )
+    try:
+        payload = provider._build_payload("明天就是周五了")
+        assert payload["messages"][0]["content"] == "reference text"
+        assert payload["messages"][1]["content"] == "<style>开心</style>明天就是周五了"
+        assert "reference text" not in payload["messages"][1]["content"]
     finally:
         asyncio.run(provider.terminate())
 
@@ -129,7 +196,10 @@ async def fake_post(_url, headers=None, json=None):
     assert result == "transcribed text"
     assert captured["json"]["messages"][0]["content"] == "system prompt"
     assert captured["json"]["messages"][1]["content"][0]["type"] == "input_audio"
-    assert captured["json"]["messages"][1]["content"][0]["input_audio"]["data"] == "ZmFrZQ=="
+    assert (
+        captured["json"]["messages"][1]["content"][0]["input_audio"]["data"]
+        == "ZmFrZQ=="
+    )
     assert captured["json"]["messages"][1]["content"][1]["text"] == "user prompt"