AstrBotDevs · Soulter · Mar 20, 2026 · Mar 19, 2026 · Mar 19, 2026 · Mar 19, 2026
diff --git a/astrbot/core/config/default.py b/astrbot/core/config/default.py
@@ -1431,6 +1431,20 @@ class ChatProviderTemplate(TypedDict):
                         "model": "whisper-1",
                         "proxy": "",
                     },
+                    "MiMo STT(API)": {
+                        "id": "mimo_stt",
+                        "provider": "mimo",
+                        "type": "mimo_stt_api",
+                        "provider_type": "speech_to_text",
+                        "enable": False,
+                        "api_key": "",
+                        "api_base": "https://api.xiaomimimo.com/v1",
+                        "model": "mimo-v2-omni",
+                        "mimo-stt-system-prompt": "You are a speech transcription assistant. Transcribe the spoken content from the audio exactly and return only the transcription text.",
+                        "mimo-stt-user-prompt": "Please transcribe the content of the audio and return only the transcription text.",
+                        "timeout": "20",
+                        "proxy": "",
+                    },
                     "Whisper(Local)": {
                         "provider": "openai",
                         "type": "openai_whisper_selfhost",
@@ -1461,6 +1475,23 @@ class ChatProviderTemplate(TypedDict):
                         "timeout": "20",
                         "proxy": "",
                     },
+                    "MiMo TTS(API)": {
+                        "id": "mimo_tts",
+                        "type": "mimo_tts_api",
+                        "provider": "mimo",
+                        "provider_type": "text_to_speech",
+                        "enable": False,
+                        "api_key": "",
+                        "api_base": "https://api.xiaomimimo.com/v1",
+                        "model": "mimo-v2-tts",
+                        "mimo-tts-voice": "mimo_default",
+                        "mimo-tts-format": "wav",
+                        "mimo-tts-style-prompt": "",
+                        "mimo-tts-dialect": "",
+                        "mimo-tts-seed-text": "Hello, MiMo, have you had lunch?",
+                        "timeout": "20",
+                        "proxy": "",
+                    },
                     "Genie TTS": {
                         "id": "genie_tts",
                         "provider": "genie_tts",
@@ -2309,11 +2340,46 @@ class ChatProviderTemplate(TypedDict):
                         "type": "int",
                         "hint": "超时时间，单位为秒。",
                     },
+                    "mimo-stt-system-prompt": {
+                        "description": "系统提示词",
+                        "type": "string",
+                        "hint": "用于指导 MiMo STT 转录行为的 system prompt。",
+                    },
+                    "mimo-stt-user-prompt": {
+                        "description": "用户提示词",
+                        "type": "string",
+                        "hint": "附加给 MiMo STT 的用户提示词，用于约束返回结果格式。",
+                    },
                     "openai-tts-voice": {
                         "description": "voice",
                         "type": "string",
                         "hint": "OpenAI TTS 的声音。OpenAI 默认支持：'alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer'",
                     },
+                    "mimo-tts-voice": {
+                        "description": "音色",
+                        "type": "string",
+                        "hint": "MiMo TTS 的音色名称。可选值包括 'mimo_default'、'default_en'、'default_zh'。",
+                    },
+                    "mimo-tts-format": {
+                        "description": "输出格式",
+                        "type": "string",
+                        "hint": "MiMo TTS 生成音频的格式。支持 'wav'、'mp3'、'pcm'。",
+                    },
+                    "mimo-tts-style-prompt": {
+                        "description": "风格提示词",
+                        "type": "string",
+                        "hint": "用于控制生成语音的说话风格、语气或情绪，例如温柔、活泼、沉稳等。可留空。",
+                    },
+                    "mimo-tts-dialect": {
+                        "description": "方言",
+                        "type": "string",
+                        "hint": "指定生成语音时使用的方言或口音，例如四川话、粤语口音等。可留空。",
+                    },
+                    "mimo-tts-seed-text": {
+                        "description": "种子文本",
+                        "type": "string",
+                        "hint": "用于引导音色和说话方式的参考文本，会影响生成语音的表达风格。",
+                    },
                     "fishaudio-tts-character": {
                         "description": "character",
                         "type": "string",

diff --git a/astrbot/core/provider/manager.py b/astrbot/core/provider/manager.py
@@ -387,6 +387,10 @@ def dynamic_import_provider(self, type: str) -> None:
                 from .sources.whisper_api_source import (
                     ProviderOpenAIWhisperAPI as ProviderOpenAIWhisperAPI,
                 )
+            case "mimo_stt_api":
+                from .sources.mimo_stt_api_source import (
+                    ProviderMiMoSTTAPI as ProviderMiMoSTTAPI,
+                )
             case "openai_whisper_selfhost":
                 from .sources.whisper_selfhosted_source import (
                     ProviderOpenAIWhisperSelfHost as ProviderOpenAIWhisperSelfHost,
@@ -399,6 +403,10 @@ def dynamic_import_provider(self, type: str) -> None:
                 from .sources.openai_tts_api_source import (
                     ProviderOpenAITTSAPI as ProviderOpenAITTSAPI,
                 )
+            case "mimo_tts_api":
+                from .sources.mimo_tts_api_source import (
+                    ProviderMiMoTTSAPI as ProviderMiMoTTSAPI,
+                )
             case "genie_tts":
                 from .sources.genie_tts import (
                     GenieTTSProvider as GenieTTSProvider,

diff --git a/astrbot/core/provider/sources/mimo_api_common.py b/astrbot/core/provider/sources/mimo_api_common.py
@@ -0,0 +1,129 @@
+import base64
+import uuid
+from pathlib import Path
+from urllib.parse import urlparse
+
+import httpx
+
+from astrbot import logger
+from astrbot.core.utils.astrbot_path import get_astrbot_temp_path
+from astrbot.core.utils.io import download_file
+from astrbot.core.utils.tencent_record_helper import (
+    convert_to_pcm_wav,
+    tencent_silk_to_wav,
+)
+
+# Endpoint root and model names used when the provider config omits them.
+DEFAULT_MIMO_API_BASE = "https://api.xiaomimimo.com/v1"
+DEFAULT_MIMO_TTS_MODEL = "mimo-v2-tts"
+DEFAULT_MIMO_STT_MODEL = "mimo-v2-omni"
+
+# Text-to-speech defaults.
+DEFAULT_MIMO_TTS_VOICE = "mimo_default"
+DEFAULT_MIMO_TTS_SEED_TEXT = "Hello, MiMo, have you had lunch?"
+
+# Speech-to-text prompts: both ask the model to return the transcription only.
+DEFAULT_MIMO_STT_SYSTEM_PROMPT = (
+    "You are a speech transcription assistant. Transcribe the spoken content "
+    "from the audio exactly and return only the transcription text."
+)
+DEFAULT_MIMO_STT_USER_PROMPT = (
+    "Please transcribe the content of the audio "
+    "and return only the transcription text."
+)
+
+
+class MiMoAPIError(Exception):
+    """Error raised when a MiMo API call fails or returns an unusable response."""
+
+
+def normalize_timeout(timeout: int | str | None) -> int | None:
+    """Coerce a configured timeout value into an integer, or ``None`` if unset.
+
+    The provider templates store the timeout as a string (for example
+    ``"20"``), while code may pass an integer directly.
+
+    Args:
+        timeout: An integer, a string holding an integer literal, ``None``,
+            or a blank string.
+
+    Returns:
+        The timeout as an ``int``; ``None`` when the value is ``None``, empty,
+        or whitespace only. Non-string values are returned unchanged.
+
+    Raises:
+        ValueError: If a non-blank string is not a valid integer literal.
+    """
+    if timeout is None:
+        return None
+    if isinstance(timeout, str):
+        text = timeout.strip()
+        # A blank field means "no timeout configured", not a malformed number.
+        return int(text) if text else None
+    return timeout
+
+
+def build_headers(api_key: str) -> dict[str, str]:
+    """Return JSON request headers, with a Bearer token when ``api_key`` is set.
+
+    Args:
+        api_key: API key; an empty value omits the ``Authorization`` header.
+    """
+    auth = {"Authorization": f"Bearer {api_key}"} if api_key else {}
+    return {"Content-Type": "application/json", **auth}
+
+
+def get_temp_dir() -> Path:
+    """Return the path given by ``get_astrbot_temp_path()``, creating it if needed."""
+    directory = Path(get_astrbot_temp_path())
+    # Create missing parents too, and tolerate the directory already existing.
+    directory.mkdir(exist_ok=True, parents=True)
+    return directory
+
+
+def create_http_client(timeout: int | None, proxy: str) -> httpx.AsyncClient:
+    """Build the ``httpx.AsyncClient`` used for MiMo API requests.
+
+    Args:
+        timeout: Passed as the client's ``timeout``.
+        proxy: Proxy URL; when empty, no ``proxy`` argument is passed.
+
+    Returns:
+        A client that follows redirects. The caller is responsible for
+        closing it.
+    """
+    if not proxy:
+        return httpx.AsyncClient(timeout=timeout, follow_redirects=True)
+
+    logger.info("[MiMo API] Using proxy: %s", proxy)
+    return httpx.AsyncClient(timeout=timeout, follow_redirects=True, proxy=proxy)
+
+
+def build_api_url(api_base: str) -> str:
+    """Return the chat-completions endpoint URL for ``api_base``.
+
+    Trailing slashes are dropped, and the ``/chat/completions`` suffix is
+    appended unless ``api_base`` already ends with it.
+    """
+    endpoint = "/chat/completions"
+    base = api_base.rstrip("/")
+    return base if base.endswith(endpoint) else base + endpoint
+
+
+async def _detect_audio_format(file_path: Path) -> str | None:
+    """Sniff the first 8 bytes of a file for a SILK or AMR marker.
+
+    Args:
+        file_path: File to inspect.
+
+    Returns:
+        ``"silk"`` or ``"amr"`` when the matching marker occurs anywhere in
+        the first 8 bytes; ``None`` when neither does or the file is missing.
+    """
+    try:
+        with file_path.open("rb") as handle:
+            head = handle.read(8)
+    except FileNotFoundError:
+        return None
+
+    # Substring match rather than prefix match, so a marker preceded by a
+    # leading byte is still recognised. SILK is checked before AMR.
+    for marker, label in ((b"SILK", "silk"), (b"#!AMR", "amr")):
+        if marker in head:
+            return label
+    return None
+
+
+async def prepare_audio_input(audio_source: str) -> tuple[str, list[Path]]:
+    """Load audio from a local path or URL and return it base64-encoded.
+
+    ``http://`` / ``https://`` sources are first downloaded into the temp
+    directory. Files with an ``.amr`` / ``.silk`` suffix, or downloaded from
+    the Tencent media host, are sniffed and converted to WAV when they turn
+    out to be SILK or AMR.
+
+    Args:
+        audio_source: Local file path or HTTP(S) URL of the audio.
+
+    Returns:
+        A tuple ``(encoded_audio, cleanup_paths)``: the base64 text of the
+        final audio bytes, and the temporary files created along the way.
+        The caller must remove those files (for example with
+        ``cleanup_files``).
+
+    Raises:
+        FileNotFoundError: If the local or downloaded file does not exist.
+        Exception: Download or conversion errors propagate unchanged. Any
+            temporary files created before the failure are removed first.
+    """
+    cleanup_paths: list[Path] = []
+    try:
+        source_path = Path(audio_source)
+        is_remote = audio_source.startswith(("http://", "https://"))
+        is_tencent = is_remote and "multimedia.nt.qq.com.cn" in audio_source
+
+        if is_remote:
+            parsed_url = urlparse(audio_source)
+            suffix = Path(parsed_url.path).suffix or ".input"
+            download_path = (
+                get_temp_dir() / f"mimo_audio_{uuid.uuid4().hex[:8]}{suffix}"
+            )
+            # Register before downloading so a partial download is cleaned up.
+            cleanup_paths.append(download_path)
+            await download_file(audio_source, str(download_path))
+            source_path = download_path
+
+        if not source_path.exists():
+            raise FileNotFoundError(f"File does not exist: {source_path}")
+
+        if source_path.suffix.lower() in {".amr", ".silk"} or is_tencent:
+            file_format = await _detect_audio_format(source_path)
+            if file_format in {"silk", "amr"}:
+                converted_path = (
+                    get_temp_dir() / f"mimo_audio_{uuid.uuid4().hex[:8]}.wav"
+                )
+                cleanup_paths.append(converted_path)
+                if file_format == "silk":
+                    logger.info("Converting silk file to wav for MiMo STT...")
+                    await tencent_silk_to_wav(str(source_path), str(converted_path))
+                else:
+                    logger.info("Converting amr file to wav for MiMo STT...")
+                    await convert_to_pcm_wav(str(source_path), str(converted_path))
+                source_path = converted_path
+
+        encoded_audio = base64.b64encode(source_path.read_bytes()).decode("utf-8")
+    except BaseException:
+        # On failure the caller never receives cleanup_paths, so the temp
+        # files would leak unless they are removed here. BaseException is
+        # caught so task cancellation is covered too; it is always re-raised.
+        cleanup_files(cleanup_paths)
+        raise
+    return encoded_audio, cleanup_paths
+
+
+def cleanup_files(paths: list[Path]) -> None:
+    """Remove temporary files on a best-effort basis.
+
+    Files that are already missing are ignored. Any other failure is logged
+    as a warning and the remaining paths are still processed.
+
+    Args:
+        paths: Files to delete.
+    """
+    for leftover in paths:
+        try:
+            leftover.unlink(missing_ok=True)
+        except Exception as err:
+            logger.warning(
+                "Failed to remove temporary MiMo file %s: %s", leftover, err
+            )
diff --git a/astrbot/core/provider/sources/mimo_stt_api_source.py b/astrbot/core/provider/sources/mimo_stt_api_source.py
@@ -0,0 +1,100 @@
+from ..entities import ProviderType
+from ..provider import STTProvider
+from ..register import register_provider_adapter
+from .mimo_api_common import (
+    DEFAULT_MIMO_API_BASE,
+    DEFAULT_MIMO_STT_MODEL,
+    DEFAULT_MIMO_STT_SYSTEM_PROMPT,
+    DEFAULT_MIMO_STT_USER_PROMPT,
+    MiMoAPIError,
+    build_api_url,
+    build_headers,
+    cleanup_files,
+    create_http_client,
+    normalize_timeout,
+    prepare_audio_input,
+)
+
+
+@register_provider_adapter(
+    "mimo_stt_api",
+    "MiMo STT API",
+    provider_type=ProviderType.SPEECH_TO_TEXT,
+)
+class ProviderMiMoSTTAPI(STTProvider):
+    """Speech-to-text provider that sends audio to the MiMo chat-completions API."""
+
+    def __init__(
+        self,
+        provider_config: dict,
+        provider_settings: dict,
+    ) -> None:
+        """Read connection settings and prompts from ``provider_config``.
+
+        Args:
+            provider_config: Provider entry; the keys read here are
+                ``api_key``, ``api_base``, ``proxy``, ``timeout``, ``model``,
+                ``mimo-stt-system-prompt`` and ``mimo-stt-user-prompt``.
+            provider_settings: Forwarded unchanged to the base class.
+        """
+        super().__init__(provider_config, provider_settings)
+        self.chosen_api_key = provider_config.get("api_key", "")
+        # Use ``or`` so a field that is present but cleared to "" still falls
+        # back to the default instead of producing an empty URL or model name.
+        self.api_base = provider_config.get("api_base") or DEFAULT_MIMO_API_BASE
+        self.proxy = provider_config.get("proxy", "")
+        self.timeout = normalize_timeout(provider_config.get("timeout", 20))
+        self.system_prompt = provider_config.get(
+            "mimo-stt-system-prompt",
+            DEFAULT_MIMO_STT_SYSTEM_PROMPT,
+        )
+        self.user_prompt = provider_config.get(
+            "mimo-stt-user-prompt",
+            DEFAULT_MIMO_STT_USER_PROMPT,
+        )
+        self.set_model(provider_config.get("model") or DEFAULT_MIMO_STT_MODEL)
+        self.client = create_http_client(self.timeout, self.proxy)
+
+    def _build_payload(self, encoded_audio: str) -> dict:
+        """Build the chat-completions request body for one transcription."""
+        return {
+            # NOTE(review): model_name is presumably set by set_model() on the
+            # base class, which is outside this file — confirm.
+            "model": self.model_name,
+            "messages": [
+                {
+                    "role": "system",
+                    "content": self.system_prompt,
+                },
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "input_audio",
+                            "input_audio": {
+                                "data": encoded_audio,
+                            },
+                        },
+                        {
+                            "type": "text",
+                            "text": self.user_prompt,
+                        },
+                    ],
+                },
+            ],
+            "max_completion_tokens": 1024,
+        }
+
+    @staticmethod
+    def _extract_transcription(response) -> str:
+        """Validate the HTTP response and return the stripped transcription."""
+        try:
+            response.raise_for_status()
+        except Exception as exc:
+            error_text = response.text[:1024]
+            raise MiMoAPIError(
+                f"MiMo STT API request failed: HTTP {response.status_code}, response: {error_text}"
+            ) from exc
+
+        try:
+            data = response.json()
+        except ValueError as exc:
+            # JSON decode errors subclass ValueError; report them as API errors.
+            raise MiMoAPIError(
+                f"MiMo STT API returned a non-JSON response: {response.text[:1024]}"
+            ) from exc
+
+        # Walk the body defensively: any level may be missing, null, or of an
+        # unexpected type, and each such case is reported as MiMoAPIError.
+        choices = data.get("choices") if isinstance(data, dict) else None
+        first_choice = choices[0] if isinstance(choices, list) and choices else None
+        message = (
+            first_choice.get("message") if isinstance(first_choice, dict) else None
+        )
+        content = message.get("content") if isinstance(message, dict) else None
+        if not isinstance(content, str) or not content.strip():
+            raise MiMoAPIError(f"MiMo STT API returned empty transcription: {data}")
+        return content.strip()
+
+    async def get_text(self, audio_url: str) -> str:
+        """Transcribe the audio at ``audio_url``.
+
+        Args:
+            audio_url: Local file path or HTTP(S) URL, handed to
+                ``prepare_audio_input``.
+
+        Returns:
+            The transcription text with surrounding whitespace removed.
+
+        Raises:
+            MiMoAPIError: On a non-success HTTP status, an unparseable body,
+                or an empty or missing transcription.
+        """
+        encoded_audio, cleanup_paths = await prepare_audio_input(audio_url)
+        try:
+            response = await self.client.post(
+                build_api_url(self.api_base),
+                headers=build_headers(self.chosen_api_key),
+                json=self._build_payload(encoded_audio),
+            )
+            return self._extract_transcription(response)
+        finally:
+            # Temp files are removed whether the request succeeded or not.
+            cleanup_files(cleanup_paths)
+
+    async def terminate(self) -> None:
+        """Close the underlying HTTP client."""
+        if self.client:
+            await self.client.aclose()