From 112ed9c00d73e3d0918f1c009e0b5b638117d72d Mon Sep 17 00:00:00 2001 From: Richard Liu <2645345468@qq.com> Date: Mon, 23 Mar 2026 00:22:49 +0800 Subject: [PATCH] fix: align mimo tts style payload with official docs --- astrbot/core/config/default.py | 6 +- .../provider/sources/mimo_tts_api_source.py | 56 ++++++++---- .../en-US/features/config-metadata.json | 6 +- .../ru-RU/features/config-metadata.json | 6 +- .../zh-CN/features/config-metadata.json | 6 +- tests/test_mimo_api_sources.py | 90 ++++++++++++++++--- 6 files changed, 129 insertions(+), 41 deletions(-) diff --git a/astrbot/core/config/default.py b/astrbot/core/config/default.py index 20d9e06134..0f43dbd06d 100644 --- a/astrbot/core/config/default.py +++ b/astrbot/core/config/default.py @@ -2442,17 +2442,17 @@ class ChatProviderTemplate(TypedDict): "mimo-tts-style-prompt": { "description": "风格提示词", "type": "string", - "hint": "用于控制生成语音的说话风格、语气或情绪,例如温柔、活泼、沉稳等。可留空。", + "hint": "会以 <style> 标签形式添加到待合成文本开头,用于控制语速、情绪、角色或风格,例如 开心、变快、孙悟空、悄悄话。可留空。", }, "mimo-tts-dialect": { "description": "方言", "type": "string", - "hint": "指定生成语音时使用的方言或口音,例如四川话、粤语口音等。可留空。", + "hint": "会与风格提示词一起写入开头的 <style> 标签中,例如 东北话、四川话、河南话、粤语。可留空。", }, "mimo-tts-seed-text": { "description": "种子文本", "type": "string", - "hint": "用于引导音色和说话方式的参考文本,会影响生成语音的表达风格。", + "hint": "作为可选的 user 消息发送,用于辅助调节语气和风格,不会拼接到待合成文本中。", }, "fishaudio-tts-character": { "description": "character", diff --git a/astrbot/core/provider/sources/mimo_tts_api_source.py b/astrbot/core/provider/sources/mimo_tts_api_source.py index daad55e1e6..2966bfb7d8 100644 --- a/astrbot/core/provider/sources/mimo_tts_api_source.py +++ b/astrbot/core/provider/sources/mimo_tts_api_source.py @@ -44,35 +44,53 @@ def __init__( self.set_model(provider_config.get("model", DEFAULT_MIMO_TTS_MODEL)) self.client = create_http_client(self.timeout, self.proxy) - def _build_user_prompt(self) -> str: - prompt_parts: list[str] = [] + def _build_user_prompt(self) -> str | None: + seed_text = self.seed_text.strip() + return 
seed_text or None + + def _build_style_prefix(self) -> str: + style_parts: list[str] = [] if self.style_prompt.strip(): - prompt_parts.append(self.style_prompt.strip()) + style_parts.append(self.style_prompt.strip()) if self.dialect.strip(): - prompt_parts.append(f"Please use {self.dialect.strip()} when speaking.") + style_parts.append(self.dialect.strip()) + + style_content = " ".join(style_parts).strip() + if not style_content: + return "" - if not prompt_parts: - return self.seed_text + # MiMo recommends using only the singing style tag at the very beginning. + if "唱歌" in style_content: + return "<style>唱歌</style>" - if self.seed_text.strip(): - prompt_parts.append(self.seed_text.strip()) + return f"<style>{style_content}</style>" - return " ".join(prompt_parts) + def _build_assistant_content(self, text: str) -> str: + return f"{self._build_style_prefix()}{text}" def _build_payload(self, text: str) -> dict: - return { - "model": self.model_name, - "messages": [ + messages: list[dict[str, str]] = [] + + user_prompt = self._build_user_prompt() + if user_prompt: + messages.append( { "role": "user", - "content": self._build_user_prompt(), - }, - { - "role": "assistant", - "content": text, - }, - ], + "content": user_prompt, + } + ) + + messages.append( + { + "role": "assistant", + "content": self._build_assistant_content(text), + } + ) + + return { + "model": self.model_name, + "messages": messages, "audio": { "format": self.audio_format, "voice": self.voice, diff --git a/dashboard/src/i18n/locales/en-US/features/config-metadata.json b/dashboard/src/i18n/locales/en-US/features/config-metadata.json index 5beba764d2..43aae5984b 100644 --- a/dashboard/src/i18n/locales/en-US/features/config-metadata.json +++ b/dashboard/src/i18n/locales/en-US/features/config-metadata.json @@ -1457,15 +1457,15 @@ }, "mimo-tts-style-prompt": { "description": "Style prompt", - "hint": "Guides speaking style, tone, or emotion such as gentle, lively, or calm. Optional." 
+ "hint": "Prepended to the synthesis target text as a <style> tag to control speed, emotion, character, or style, such as happy, faster, Sun Wukong, or whispering. Optional." }, "mimo-tts-dialect": { "description": "Dialect", - "hint": "Target dialect or accent for generated speech, such as Sichuan dialect. Optional." + "hint": "Combined with the style prompt inside the leading <style> tag, for example Northeastern Mandarin, Sichuan dialect, Henan dialect, or Cantonese. Optional." }, "mimo-tts-seed-text": { "description": "Seed text", - "hint": "Reference text used to guide voice characteristics and speaking style." + "hint": "Sent as an optional user message to help guide tone and speaking style. It is not appended to the synthesis target text." }, "fishaudio-tts-character": { "description": "character", diff --git a/dashboard/src/i18n/locales/ru-RU/features/config-metadata.json b/dashboard/src/i18n/locales/ru-RU/features/config-metadata.json index 3e96c8f256..06f60dd40a 100644 --- a/dashboard/src/i18n/locales/ru-RU/features/config-metadata.json +++ b/dashboard/src/i18n/locales/ru-RU/features/config-metadata.json @@ -1454,15 +1454,15 @@ }, "mimo-tts-style-prompt": { "description": "Подсказка стиля", - "hint": "Задает стиль речи, тон или эмоцию, например мягкий, живой или спокойный. Необязательно." + "hint": "Добавляется в начало синтезируемого текста в виде тега <style> и управляет скоростью, эмоцией, ролью или манерой речи. Необязательно." }, "mimo-tts-dialect": { "description": "Диалект", - "hint": "Диалект или акцент для синтезируемой речи, например сычуаньский диалект. Необязательно." + "hint": "Объединяется с подсказкой стиля внутри начального тега <style>, например северо-восточный, сычуаньский, хэнаньский или кантонский вариант речи. Необязательно." }, "mimo-tts-seed-text": { "description": "Начальный текст", - "hint": "Эталонный текст, который помогает задать особенности голоса и манеру речи." + "hint": "Отправляется как необязательное user-сообщение для настройки тона и манеры речи. 
Не добавляется к самому тексту синтеза." }, "fishaudio-tts-character": { "description": "Персонаж", diff --git a/dashboard/src/i18n/locales/zh-CN/features/config-metadata.json b/dashboard/src/i18n/locales/zh-CN/features/config-metadata.json index 9cbbd38de6..7b59a981d5 100644 --- a/dashboard/src/i18n/locales/zh-CN/features/config-metadata.json +++ b/dashboard/src/i18n/locales/zh-CN/features/config-metadata.json @@ -1459,15 +1459,15 @@ }, "mimo-tts-style-prompt": { "description": "风格提示词", - "hint": "用于控制生成语音的说话风格、语气或情绪,例如温柔、活泼、沉稳等。可留空。" + "hint": "会以 <style> 标签形式添加到待合成文本开头,用于控制语速、情绪、角色或风格,例如 开心、变快、孙悟空、悄悄话。可留空。" }, "mimo-tts-dialect": { "description": "方言", - "hint": "指定生成语音时使用的方言或口音,例如四川话、粤语口音等。可留空。" + "hint": "会与风格提示词一起写入开头的 <style> 标签中,例如 东北话、四川话、河南话、粤语。可留空。" }, "mimo-tts-seed-text": { "description": "种子文本", - "hint": "用于引导音色和说话方式的参考文本,会影响生成语音的表达风格。" + "hint": "作为可选的 user 消息发送,用于辅助调节语气和风格,不会拼接到待合成文本中。" }, "fishaudio-tts-character": { "description": "character", diff --git a/tests/test_mimo_api_sources.py b/tests/test_mimo_api_sources.py index d262f1c275..c2b02aa136 100644 --- a/tests/test_mimo_api_sources.py +++ b/tests/test_mimo_api_sources.py @@ -35,7 +35,7 @@ def _make_stt_provider(overrides: dict | None = None) -> ProviderMiMoSTTAPI: return ProviderMiMoSTTAPI(provider_config=provider_config, provider_settings={}) -def test_mimo_tts_prompt_returns_seed_text_when_no_style_or_dialect(): +def test_mimo_tts_user_prompt_returns_seed_text(): provider = _make_tts_provider() try: assert provider._build_user_prompt() == "seed text" @@ -43,21 +43,88 @@ def test_mimo_tts_prompt_returns_seed_text_when_no_style_or_dialect(): asyncio.run(provider.terminate()) -def test_mimo_tts_payload_includes_dialect_and_style_prompt(): +def test_mimo_tts_assistant_content_prefixes_style_and_dialect(): provider = _make_tts_provider( { - "mimo-tts-style-prompt": "Please sound cheerful and lively.", - "mimo-tts-dialect": "Sichuan dialect", + "mimo-tts-style-prompt": "开心", + "mimo-tts-dialect": "四川话", 
"mimo-tts-seed-text": "You are chatting with a close friend.", } ) try: payload = provider._build_payload("hello") - assert payload["messages"][0]["content"] == ( - "Please sound cheerful and lively. " - "Please use Sichuan dialect when speaking. " - "You are chatting with a close friend." - ) + assert payload["messages"][0] == { + "role": "user", + "content": "You are chatting with a close friend.", + } + assert payload["messages"][1]["content"] == "<style>开心 四川话</style>hello" + finally: + asyncio.run(provider.terminate()) + + +def test_mimo_tts_payload_omits_user_message_without_seed_text(): + provider = _make_tts_provider( + { + "mimo-tts-seed-text": "", + "mimo-tts-style-prompt": "开心", + } + ) + try: + payload = provider._build_payload("hello") + assert payload["messages"] == [ + { + "role": "assistant", + "content": "<style>开心</style>hello", + } + ] + finally: + asyncio.run(provider.terminate()) + + +def test_mimo_tts_singing_style_uses_single_style_tag(): + provider = _make_tts_provider( + { + "mimo-tts-style-prompt": "唱歌 开心", + "mimo-tts-dialect": "粤语", + } + ) + try: + payload = provider._build_payload("歌词") + assert payload["messages"][1]["content"] == "<style>唱歌</style>歌词" + finally: + asyncio.run(provider.terminate()) + + +def test_mimo_tts_plain_text_stays_in_assistant_message_when_no_style(): + provider = _make_tts_provider( + { + "mimo-tts-seed-text": "", + } + ) + try: + payload = provider._build_payload("hello") + assert payload["messages"] == [ + { + "role": "assistant", + "content": "hello", + } + ] + finally: + asyncio.run(provider.terminate()) + + +def test_mimo_tts_seed_text_is_not_prepended_to_assistant_content(): + provider = _make_tts_provider( + { + "mimo-tts-style-prompt": "开心", + "mimo-tts-seed-text": "reference text", + } + ) + try: + payload = provider._build_payload("明天就是周五了") + assert payload["messages"][0]["content"] == "reference text" + assert payload["messages"][1]["content"] == "<style>开心</style>明天就是周五了" + assert "reference text" not in payload["messages"][1]["content"] finally: 
asyncio.run(provider.terminate()) @@ -129,7 +196,10 @@ async def fake_post(_url, headers=None, json=None): assert result == "transcribed text" assert captured["json"]["messages"][0]["content"] == "system prompt" assert captured["json"]["messages"][1]["content"][0]["type"] == "input_audio" - assert captured["json"]["messages"][1]["content"][0]["input_audio"]["data"] == "ZmFrZQ==" + assert ( + captured["json"]["messages"][1]["content"][0]["input_audio"]["data"] + == "ZmFrZQ==" + ) assert captured["json"]["messages"][1]["content"][1]["text"] == "user prompt"