Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions astrbot/core/config/default.py
Original file line number Diff line number Diff line change
Expand Up @@ -2442,17 +2442,17 @@ class ChatProviderTemplate(TypedDict):
"mimo-tts-style-prompt": {
"description": "风格提示词",
"type": "string",
"hint": "用于控制生成语音的说话风格、语气或情绪,例如温柔、活泼、沉稳等。可留空。",
"hint": "会以 <style>...</style> 标签形式添加到待合成文本开头,用于控制语速、情绪、角色或风格,例如 开心、变快、孙悟空、悄悄话。可留空。",
},
"mimo-tts-dialect": {
"description": "方言",
"type": "string",
"hint": "指定生成语音时使用的方言或口音,例如四川话、粤语口音等。可留空。",
"hint": "会与风格提示词一起写入开头的 <style>...</style> 标签中,例如 东北话、四川话、河南话、粤语。可留空。",
},
"mimo-tts-seed-text": {
"description": "种子文本",
"type": "string",
"hint": "用于引导音色和说话方式的参考文本,会影响生成语音的表达风格。",
"hint": "作为可选的 user 消息发送,用于辅助调节语气和风格,不会拼接到待合成文本中。",
},
"fishaudio-tts-character": {
"description": "character",
Expand Down
56 changes: 37 additions & 19 deletions astrbot/core/provider/sources/mimo_tts_api_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,35 +44,53 @@ def __init__(
self.set_model(provider_config.get("model", DEFAULT_MIMO_TTS_MODEL))
self.client = create_http_client(self.timeout, self.proxy)

def _build_user_prompt(self) -> str:
prompt_parts: list[str] = []
def _build_user_prompt(self) -> str | None:
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The function _build_user_prompt now returns None when the seed text is empty or contains only whitespace. It would be helpful to add a docstring that explains the function's purpose, its return value, and this None case.

seed_text = self.seed_text.strip()
return seed_text or None
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

This line returns the seed_text if it's not empty, otherwise it returns None. Consider adding a comment to clarify this behavior, as it might not be immediately obvious to someone reading the code.

Suggested change
return seed_text or None
return seed_text or None # Returns seed_text if not empty, otherwise None


def _build_style_prefix(self) -> str:
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

It would be helpful to add a docstring to explain the purpose of the _build_style_prefix function and what the returned string represents.

style_parts: list[str] = []

if self.style_prompt.strip():
prompt_parts.append(self.style_prompt.strip())
style_parts.append(self.style_prompt.strip())
if self.dialect.strip():
prompt_parts.append(f"Please use {self.dialect.strip()} when speaking.")
style_parts.append(self.dialect.strip())

style_content = " ".join(style_parts).strip()
if not style_content:
return ""

if not prompt_parts:
return self.seed_text
# MiMo recommends using only the singing style tag at the very beginning.
if "唱歌" in style_content:
return "<style>唱歌</style>"
Comment on lines +64 to +65
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

This logic handles the special case where the style includes "唱歌". It would be beneficial to add a comment explaining why this special handling is necessary, referencing the MiMo documentation if possible.

Suggested change
if "唱歌" in style_content:
return "<style>唱歌</style>"
# MiMo recommends using only the singing style tag at the very beginning.
if "唱歌" in style_content:
return "<style>唱歌</style>" # Special case for singing style


if self.seed_text.strip():
prompt_parts.append(self.seed_text.strip())
return f"<style>{style_content}</style>"
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Consider adding a comment here to explain that the <style> tag is being constructed and that this is the standard way to apply styles according to the MiMo TTS API.

Suggested change
return f"<style>{style_content}</style>"
# Construct the style tag with the combined style content
return f"<style>{style_content}</style>"


return " ".join(prompt_parts)
def _build_assistant_content(self, text: str) -> str:
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

It would be helpful to add a docstring to explain the purpose of the _build_assistant_content function and what the returned string represents.

return f"{self._build_style_prefix()}{text}"

def _build_payload(self, text: str) -> dict:
return {
"model": self.model_name,
"messages": [
messages: list[dict[str, str]] = []

user_prompt = self._build_user_prompt()
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Consider adding a comment to clarify that user_prompt can be None and that the following if statement handles the case where no user prompt is provided.

Suggested change
user_prompt = self._build_user_prompt()
user_prompt = self._build_user_prompt() # Can be None if no seed text is provided
if user_prompt:

if user_prompt:
messages.append(
{
"role": "user",
"content": self._build_user_prompt(),
},
{
"role": "assistant",
"content": text,
},
],
"content": user_prompt,
}
)

messages.append(
{
"role": "assistant",
"content": self._build_assistant_content(text),
}
)

return {
"model": self.model_name,
"messages": messages,
"audio": {
"format": self.audio_format,
"voice": self.voice,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1457,15 +1457,15 @@
},
"mimo-tts-style-prompt": {
"description": "Style prompt",
"hint": "Guides speaking style, tone, or emotion such as gentle, lively, or calm. Optional."
"hint": "Prepended to the synthesis target text as a <style>...</style> tag to control speed, emotion, character, or style, such as happy, faster, Sun Wukong, or whispering. Optional."
},
"mimo-tts-dialect": {
"description": "Dialect",
"hint": "Target dialect or accent for generated speech, such as Sichuan dialect. Optional."
"hint": "Combined with the style prompt inside the leading <style>...</style> tag, for example Northeastern Mandarin, Sichuan dialect, Henan dialect, or Cantonese. Optional."
},
"mimo-tts-seed-text": {
"description": "Seed text",
"hint": "Reference text used to guide voice characteristics and speaking style."
"hint": "Sent as an optional user message to help guide tone and speaking style. It is not appended to the synthesis target text."
},
"fishaudio-tts-character": {
"description": "character",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1454,15 +1454,15 @@
},
"mimo-tts-style-prompt": {
"description": "Подсказка стиля",
"hint": "Задает стиль речи, тон или эмоцию, например мягкий, живой или спокойный. Необязательно."
"hint": "Добавляется в начало синтезируемого текста в виде тега <style>...</style> и управляет скоростью, эмоцией, ролью или манерой речи. Необязательно."
},
"mimo-tts-dialect": {
"description": "Диалект",
"hint": "Диалект или акцент для синтезируемой речи, например сычуаньский диалект. Необязательно."
"hint": "Объединяется с подсказкой стиля внутри начального тега <style>...</style>, например северо-восточный, сычуаньский, хэнаньский или кантонский вариант речи. Необязательно."
},
"mimo-tts-seed-text": {
"description": "Начальный текст",
"hint": "Эталонный текст, который помогает задать особенности голоса и манеру речи."
"hint": "Отправляется как необязательное user-сообщение для настройки тона и манеры речи. Не добавляется к самому тексту синтеза."
},
"fishaudio-tts-character": {
"description": "Персонаж",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1459,15 +1459,15 @@
},
"mimo-tts-style-prompt": {
"description": "风格提示词",
"hint": "用于控制生成语音的说话风格、语气或情绪,例如温柔、活泼、沉稳等。可留空。"
"hint": "会以 <style>...</style> 标签形式添加到待合成文本开头,用于控制语速、情绪、角色或风格,例如 开心、变快、孙悟空、悄悄话。可留空。"
},
"mimo-tts-dialect": {
"description": "方言",
"hint": "指定生成语音时使用的方言或口音,例如四川话、粤语口音等。可留空。"
"hint": "会与风格提示词一起写入开头的 <style>...</style> 标签中,例如 东北话、四川话、河南话、粤语。可留空。"
},
"mimo-tts-seed-text": {
"description": "种子文本",
"hint": "用于引导音色和说话方式的参考文本,会影响生成语音的表达风格。"
"hint": "作为可选的 user 消息发送,用于辅助调节语气和风格,不会拼接到待合成文本中。"
},
"fishaudio-tts-character": {
"description": "character",
Expand Down
90 changes: 80 additions & 10 deletions tests/test_mimo_api_sources.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,29 +35,96 @@ def _make_stt_provider(overrides: dict | None = None) -> ProviderMiMoSTTAPI:
return ProviderMiMoSTTAPI(provider_config=provider_config, provider_settings={})


def test_mimo_tts_prompt_returns_seed_text_when_no_style_or_dialect():
def test_mimo_tts_user_prompt_returns_seed_text():
provider = _make_tts_provider()
try:
assert provider._build_user_prompt() == "seed text"
finally:
asyncio.run(provider.terminate())


def test_mimo_tts_payload_includes_dialect_and_style_prompt():
def test_mimo_tts_assistant_content_prefixes_style_and_dialect():
provider = _make_tts_provider(
{
"mimo-tts-style-prompt": "Please sound cheerful and lively.",
"mimo-tts-dialect": "Sichuan dialect",
"mimo-tts-style-prompt": "开心",
"mimo-tts-dialect": "四川话",
"mimo-tts-seed-text": "You are chatting with a close friend.",
}
)
try:
payload = provider._build_payload("hello")
Comment on lines +46 to 55
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

suggestion (testing): Strengthen this test to fully validate the assistant message structure and messages length.

Currently this only asserts the full first user message and the content of the second assistant message. To make the regression test more robust for the new protocol, also assert that the second message has role == "assistant" and that len(payload["messages"]) == 2 so future refactors can’t change the ordering or add extra messages unnoticed.

Suggested implementation:

def test_mimo_tts_assistant_content_prefixes_style_and_dialect():
    """Check the payload layout when style, dialect, and seed text are all set.

    The seed text is expected as a leading user message, and the style prompt
    and dialect are expected inside one <style> tag prefixed to the text in
    the assistant message.
    """
    provider = _make_tts_provider(
        {
            "mimo-tts-style-prompt": "开心",
            "mimo-tts-dialect": "四川话",
            "mimo-tts-seed-text": "You are chatting with a close friend.",
        }
    )
    try:
        # _build_payload takes the text to synthesize as its argument.
        payload = provider._build_payload("hello")

        # Exactly two messages: the seed text, then the synthesis target.
        assert len(payload["messages"]) == 2
        user_msg, assistant_msg = payload["messages"]

        # The seed text is sent on its own as the user message.
        assert user_msg["role"] == "user"
        assert user_msg["content"] == "You are chatting with a close friend."

        # Style and dialect share a single leading tag; pin the exact string
        # so the tag format and ordering cannot drift unnoticed.
        assert assistant_msg["role"] == "assistant"
        assert assistant_msg["content"] == "<style>开心 四川话</style>hello"
    finally:
        # Always terminate the provider, even when an assertion fails.
        asyncio.run(provider.terminate())

Because I can only see part of the file, you may need to align the helper names and patterns:

  1. Replace provider._build_payload() with the actual helper you use to construct the chat payload that currently produces payload["messages"] (for example, it might be named _build_chat_request, _build_mimo_payload, etc.).
  2. If the test already builds payload and asserts on the first user message and the assistant content, you should:
    • Keep the existing payload construction.
    • Insert the following assertions around where you currently assert on payload["messages"][1]["content"]:
    assert len(payload["messages"]) == 2
    assert payload["messages"][1]["role"] == "assistant"
    and, if not already present, factor out user_msg/assistant_msg as shown so the test is clearer and more robust.
  3. Ensure the asyncio.run(provider.terminate()) call is not duplicated if there is already a termination call in a finally block or elsewhere in this test.

assert payload["messages"][0]["content"] == (
"Please sound cheerful and lively. "
"Please use Sichuan dialect when speaking. "
"You are chatting with a close friend."
)
assert payload["messages"][0] == {
"role": "user",
"content": "You are chatting with a close friend.",
}
assert payload["messages"][1]["content"] == "<style>开心 四川话</style>hello"
Comment on lines 55 to +60
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The assertion payload["messages"][0] checks the content of the user message. It would be more robust to also assert the role to ensure the message is correctly assigned to the user.

        assert payload["messages"][0]["role"] == "user"
        assert payload["messages"][0]["content"] == "You are chatting with a close friend."
        assert payload["messages"][1]["content"] == "<style>开心 四川话</style>hello"

Comment on lines +56 to +60
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The assertion payload["messages"][1]["content"] checks the content of the assistant message. It would be more robust to also assert the role to ensure the message is correctly assigned to the assistant.

Suggested change
assert payload["messages"][0] == {
"role": "user",
"content": "You are chatting with a close friend.",
}
assert payload["messages"][1]["content"] == "<style>开心 四川话</style>hello"
assert payload["messages"][0] == {
"role": "user",
"content": "You are chatting with a close friend.",
}
assert payload["messages"][1]["role"] == "assistant"
assert payload["messages"][1]["content"] == "<style>开心 四川话</style>hello"

finally:
asyncio.run(provider.terminate())


def test_mimo_tts_payload_omits_user_message_without_seed_text():
provider = _make_tts_provider(
{
"mimo-tts-seed-text": "",
"mimo-tts-style-prompt": "开心",
}
)
try:
payload = provider._build_payload("hello")
assert payload["messages"] == [
{
"role": "assistant",
"content": "<style>开心</style>hello",
}
Comment on lines +74 to +78
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The assertion payload["messages"] checks the entire messages array. It would be more robust to assert the role and content of the assistant message separately to ensure the message is correctly constructed.

        assert len(payload["messages"]) == 1
        assert payload["messages"][0]["role"] == "assistant"
        assert payload["messages"][0]["content"] == "<style>开心</style>hello"

]
finally:
asyncio.run(provider.terminate())


def test_mimo_tts_singing_style_uses_single_style_tag():
provider = _make_tts_provider(
{
"mimo-tts-style-prompt": "唱歌 开心",
"mimo-tts-dialect": "粤语",
}
)
try:
payload = provider._build_payload("歌词")
assert payload["messages"][1]["content"] == "<style>唱歌</style>歌词"
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The assertion payload["messages"][1]["content"] checks the content of the assistant message. It would be more robust to also assert the role to ensure the message is correctly assigned to the assistant.

        assert payload["messages"][1]["role"] == "assistant"
        assert payload["messages"][1]["content"] == "<style>唱歌</style>歌词"

finally:
asyncio.run(provider.terminate())


def test_mimo_tts_plain_text_stays_in_assistant_message_when_no_style():
provider = _make_tts_provider(
{
"mimo-tts-seed-text": "",
}
)
try:
payload = provider._build_payload("hello")
assert payload["messages"] == [
{
"role": "assistant",
"content": "hello",
}
Comment on lines +106 to +110
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The assertion payload["messages"] checks the entire messages array. It would be more robust to assert the role and content of the assistant message separately to ensure the message is correctly constructed.

        assert len(payload["messages"]) == 1
        assert payload["messages"][0]["role"] == "assistant"
        assert payload["messages"][0]["content"] == "hello"

]
finally:
asyncio.run(provider.terminate())


def test_mimo_tts_seed_text_is_not_prepended_to_assistant_content():
provider = _make_tts_provider(
{
"mimo-tts-style-prompt": "开心",
"mimo-tts-seed-text": "reference text",
}
)
try:
payload = provider._build_payload("明天就是周五了")
assert payload["messages"][0]["content"] == "reference text"
assert payload["messages"][1]["content"] == "<style>开心</style>明天就是周五了"
assert "reference text" not in payload["messages"][1]["content"]
Comment on lines +125 to +127
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

These assertions check the content of the user and assistant messages. It would be more robust to also assert the role to ensure the messages are correctly assigned.

        assert payload["messages"][0]["role"] == "user"
        assert payload["messages"][0]["content"] == "reference text"
        assert payload["messages"][1]["role"] == "assistant"
        assert payload["messages"][1]["content"] == "<style>开心</style>明天就是周五了"
        assert "reference text" not in payload["messages"][1]["content"]

finally:
asyncio.run(provider.terminate())

Expand Down Expand Up @@ -129,7 +196,10 @@ async def fake_post(_url, headers=None, json=None):
assert result == "transcribed text"
assert captured["json"]["messages"][0]["content"] == "system prompt"
assert captured["json"]["messages"][1]["content"][0]["type"] == "input_audio"
assert captured["json"]["messages"][1]["content"][0]["input_audio"]["data"] == "ZmFrZQ=="
assert (
captured["json"]["messages"][1]["content"][0]["input_audio"]["data"]
== "ZmFrZQ=="
)
assert captured["json"]["messages"][1]["content"][1]["text"] == "user prompt"


Expand Down
Loading