From 112ed9c00d73e3d0918f1c009e0b5b638117d72d Mon Sep 17 00:00:00 2001
From: Richard Liu <2645345468@qq.com>
Date: Mon, 23 Mar 2026 00:22:49 +0800
Subject: [PATCH] fix: align mimo tts style payload with official docs
---
astrbot/core/config/default.py | 6 +-
.../provider/sources/mimo_tts_api_source.py | 56 ++++++++----
.../en-US/features/config-metadata.json | 6 +-
.../ru-RU/features/config-metadata.json | 6 +-
.../zh-CN/features/config-metadata.json | 6 +-
tests/test_mimo_api_sources.py | 90 ++++++++++++++++---
6 files changed, 129 insertions(+), 41 deletions(-)
diff --git a/astrbot/core/config/default.py b/astrbot/core/config/default.py
index 20d9e06134..0f43dbd06d 100644
--- a/astrbot/core/config/default.py
+++ b/astrbot/core/config/default.py
@@ -2442,17 +2442,17 @@ class ChatProviderTemplate(TypedDict):
"mimo-tts-style-prompt": {
"description": "风格提示词",
"type": "string",
- "hint": "用于控制生成语音的说话风格、语气或情绪,例如温柔、活泼、沉稳等。可留空。",
+ "hint": "会以 <style> 标签形式添加到待合成文本开头,用于控制语速、情绪、角色或风格,例如 开心、变快、孙悟空、悄悄话。可留空。",
},
"mimo-tts-dialect": {
"description": "方言",
"type": "string",
- "hint": "指定生成语音时使用的方言或口音,例如四川话、粤语口音等。可留空。",
+ "hint": "会与风格提示词一起写入开头的 <style> 标签中,例如 东北话、四川话、河南话、粤语。可留空。",
},
"mimo-tts-seed-text": {
"description": "种子文本",
"type": "string",
- "hint": "用于引导音色和说话方式的参考文本,会影响生成语音的表达风格。",
+ "hint": "作为可选的 user 消息发送,用于辅助调节语气和风格,不会拼接到待合成文本中。",
},
"fishaudio-tts-character": {
"description": "character",
diff --git a/astrbot/core/provider/sources/mimo_tts_api_source.py b/astrbot/core/provider/sources/mimo_tts_api_source.py
index daad55e1e6..2966bfb7d8 100644
--- a/astrbot/core/provider/sources/mimo_tts_api_source.py
+++ b/astrbot/core/provider/sources/mimo_tts_api_source.py
@@ -44,35 +44,53 @@ def __init__(
self.set_model(provider_config.get("model", DEFAULT_MIMO_TTS_MODEL))
self.client = create_http_client(self.timeout, self.proxy)
- def _build_user_prompt(self) -> str:
- prompt_parts: list[str] = []
+ def _build_user_prompt(self) -> str | None:
+ seed_text = self.seed_text.strip()
+ return seed_text or None
+
+ def _build_style_prefix(self) -> str:
+ style_parts: list[str] = []
if self.style_prompt.strip():
- prompt_parts.append(self.style_prompt.strip())
+ style_parts.append(self.style_prompt.strip())
if self.dialect.strip():
- prompt_parts.append(f"Please use {self.dialect.strip()} when speaking.")
+ style_parts.append(self.dialect.strip())
+
+ style_content = " ".join(style_parts).strip()
+ if not style_content:
+ return ""
- if not prompt_parts:
- return self.seed_text
+ # MiMo recommends using only the singing style tag at the very beginning.
+ if "唱歌" in style_content:
+ return "<style>唱歌</style>"
- if self.seed_text.strip():
- prompt_parts.append(self.seed_text.strip())
+ return f"<style>{style_content}</style>"
- return " ".join(prompt_parts)
+ def _build_assistant_content(self, text: str) -> str:
+ return f"{self._build_style_prefix()}{text}"
def _build_payload(self, text: str) -> dict:
- return {
- "model": self.model_name,
- "messages": [
+ messages: list[dict[str, str]] = []
+
+ user_prompt = self._build_user_prompt()
+ if user_prompt:
+ messages.append(
{
"role": "user",
- "content": self._build_user_prompt(),
- },
- {
- "role": "assistant",
- "content": text,
- },
- ],
+ "content": user_prompt,
+ }
+ )
+
+ messages.append(
+ {
+ "role": "assistant",
+ "content": self._build_assistant_content(text),
+ }
+ )
+
+ return {
+ "model": self.model_name,
+ "messages": messages,
"audio": {
"format": self.audio_format,
"voice": self.voice,
diff --git a/dashboard/src/i18n/locales/en-US/features/config-metadata.json b/dashboard/src/i18n/locales/en-US/features/config-metadata.json
index 5beba764d2..43aae5984b 100644
--- a/dashboard/src/i18n/locales/en-US/features/config-metadata.json
+++ b/dashboard/src/i18n/locales/en-US/features/config-metadata.json
@@ -1457,15 +1457,15 @@
},
"mimo-tts-style-prompt": {
"description": "Style prompt",
- "hint": "Guides speaking style, tone, or emotion such as gentle, lively, or calm. Optional."
+ "hint": "Prepended to the synthesis target text as a <style> tag to control speed, emotion, character, or style, such as happy, faster, Sun Wukong, or whispering. Optional."
},
"mimo-tts-dialect": {
"description": "Dialect",
- "hint": "Target dialect or accent for generated speech, such as Sichuan dialect. Optional."
+ "hint": "Combined with the style prompt inside the leading <style> tag, for example Northeastern Mandarin, Sichuan dialect, Henan dialect, or Cantonese. Optional."
},
"mimo-tts-seed-text": {
"description": "Seed text",
- "hint": "Reference text used to guide voice characteristics and speaking style."
+ "hint": "Sent as an optional user message to help guide tone and speaking style. It is not appended to the synthesis target text."
},
"fishaudio-tts-character": {
"description": "character",
diff --git a/dashboard/src/i18n/locales/ru-RU/features/config-metadata.json b/dashboard/src/i18n/locales/ru-RU/features/config-metadata.json
index 3e96c8f256..06f60dd40a 100644
--- a/dashboard/src/i18n/locales/ru-RU/features/config-metadata.json
+++ b/dashboard/src/i18n/locales/ru-RU/features/config-metadata.json
@@ -1454,15 +1454,15 @@
},
"mimo-tts-style-prompt": {
"description": "Подсказка стиля",
- "hint": "Задает стиль речи, тон или эмоцию, например мягкий, живой или спокойный. Необязательно."
+ "hint": "Добавляется в начало синтезируемого текста в виде тега <style> и управляет скоростью, эмоцией, ролью или манерой речи. Необязательно."
},
"mimo-tts-dialect": {
"description": "Диалект",
- "hint": "Диалект или акцент для синтезируемой речи, например сычуаньский диалект. Необязательно."
+ "hint": "Объединяется с подсказкой стиля внутри начального тега <style>, например северо-восточный, сычуаньский, хэнаньский или кантонский вариант речи. Необязательно."
},
"mimo-tts-seed-text": {
"description": "Начальный текст",
- "hint": "Эталонный текст, который помогает задать особенности голоса и манеру речи."
+ "hint": "Отправляется как необязательное user-сообщение для настройки тона и манеры речи. Не добавляется к самому тексту синтеза."
},
"fishaudio-tts-character": {
"description": "Персонаж",
diff --git a/dashboard/src/i18n/locales/zh-CN/features/config-metadata.json b/dashboard/src/i18n/locales/zh-CN/features/config-metadata.json
index 9cbbd38de6..7b59a981d5 100644
--- a/dashboard/src/i18n/locales/zh-CN/features/config-metadata.json
+++ b/dashboard/src/i18n/locales/zh-CN/features/config-metadata.json
@@ -1459,15 +1459,15 @@
},
"mimo-tts-style-prompt": {
"description": "风格提示词",
- "hint": "用于控制生成语音的说话风格、语气或情绪,例如温柔、活泼、沉稳等。可留空。"
+ "hint": "会以 <style> 标签形式添加到待合成文本开头,用于控制语速、情绪、角色或风格,例如 开心、变快、孙悟空、悄悄话。可留空。"
},
"mimo-tts-dialect": {
"description": "方言",
- "hint": "指定生成语音时使用的方言或口音,例如四川话、粤语口音等。可留空。"
+ "hint": "会与风格提示词一起写入开头的 <style> 标签中,例如 东北话、四川话、河南话、粤语。可留空。"
},
"mimo-tts-seed-text": {
"description": "种子文本",
- "hint": "用于引导音色和说话方式的参考文本,会影响生成语音的表达风格。"
+ "hint": "作为可选的 user 消息发送,用于辅助调节语气和风格,不会拼接到待合成文本中。"
},
"fishaudio-tts-character": {
"description": "character",
diff --git a/tests/test_mimo_api_sources.py b/tests/test_mimo_api_sources.py
index d262f1c275..c2b02aa136 100644
--- a/tests/test_mimo_api_sources.py
+++ b/tests/test_mimo_api_sources.py
@@ -35,7 +35,7 @@ def _make_stt_provider(overrides: dict | None = None) -> ProviderMiMoSTTAPI:
return ProviderMiMoSTTAPI(provider_config=provider_config, provider_settings={})
-def test_mimo_tts_prompt_returns_seed_text_when_no_style_or_dialect():
+def test_mimo_tts_user_prompt_returns_seed_text():
provider = _make_tts_provider()
try:
assert provider._build_user_prompt() == "seed text"
@@ -43,21 +43,88 @@ def test_mimo_tts_prompt_returns_seed_text_when_no_style_or_dialect():
asyncio.run(provider.terminate())
-def test_mimo_tts_payload_includes_dialect_and_style_prompt():
+def test_mimo_tts_assistant_content_prefixes_style_and_dialect():
provider = _make_tts_provider(
{
- "mimo-tts-style-prompt": "Please sound cheerful and lively.",
- "mimo-tts-dialect": "Sichuan dialect",
+ "mimo-tts-style-prompt": "开心",
+ "mimo-tts-dialect": "四川话",
"mimo-tts-seed-text": "You are chatting with a close friend.",
}
)
try:
payload = provider._build_payload("hello")
- assert payload["messages"][0]["content"] == (
- "Please sound cheerful and lively. "
- "Please use Sichuan dialect when speaking. "
- "You are chatting with a close friend."
- )
+ assert payload["messages"][0] == {
+ "role": "user",
+ "content": "You are chatting with a close friend.",
+ }
+ assert payload["messages"][1]["content"] == "<style>开心 四川话</style>hello"
+ finally:
+ asyncio.run(provider.terminate())
+
+
+def test_mimo_tts_payload_omits_user_message_without_seed_text():
+ provider = _make_tts_provider(
+ {
+ "mimo-tts-seed-text": "",
+ "mimo-tts-style-prompt": "开心",
+ }
+ )
+ try:
+ payload = provider._build_payload("hello")
+ assert payload["messages"] == [
+ {
+ "role": "assistant",
+ "content": "<style>开心</style>hello",
+ }
+ ]
+ finally:
+ asyncio.run(provider.terminate())
+
+
+def test_mimo_tts_singing_style_uses_single_style_tag():
+ provider = _make_tts_provider(
+ {
+ "mimo-tts-style-prompt": "唱歌 开心",
+ "mimo-tts-dialect": "粤语",
+ }
+ )
+ try:
+ payload = provider._build_payload("歌词")
+ assert payload["messages"][1]["content"] == "<style>唱歌</style>歌词"
+ finally:
+ asyncio.run(provider.terminate())
+
+
+def test_mimo_tts_plain_text_stays_in_assistant_message_when_no_style():
+ provider = _make_tts_provider(
+ {
+ "mimo-tts-seed-text": "",
+ }
+ )
+ try:
+ payload = provider._build_payload("hello")
+ assert payload["messages"] == [
+ {
+ "role": "assistant",
+ "content": "hello",
+ }
+ ]
+ finally:
+ asyncio.run(provider.terminate())
+
+
+def test_mimo_tts_seed_text_is_not_prepended_to_assistant_content():
+ provider = _make_tts_provider(
+ {
+ "mimo-tts-style-prompt": "开心",
+ "mimo-tts-seed-text": "reference text",
+ }
+ )
+ try:
+ payload = provider._build_payload("明天就是周五了")
+ assert payload["messages"][0]["content"] == "reference text"
+ assert payload["messages"][1]["content"] == "<style>开心</style>明天就是周五了"
+ assert "reference text" not in payload["messages"][1]["content"]
finally:
asyncio.run(provider.terminate())
@@ -129,7 +196,10 @@ async def fake_post(_url, headers=None, json=None):
assert result == "transcribed text"
assert captured["json"]["messages"][0]["content"] == "system prompt"
assert captured["json"]["messages"][1]["content"][0]["type"] == "input_audio"
- assert captured["json"]["messages"][1]["content"][0]["input_audio"]["data"] == "ZmFrZQ=="
+ assert (
+ captured["json"]["messages"][1]["content"][0]["input_audio"]["data"]
+ == "ZmFrZQ=="
+ )
assert captured["json"]["messages"][1]["content"][1]["text"] == "user prompt"