Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 66 additions & 0 deletions astrbot/core/config/default.py
Original file line number Diff line number Diff line change
Expand Up @@ -1431,6 +1431,20 @@ class ChatProviderTemplate(TypedDict):
"model": "whisper-1",
"proxy": "",
},
"MiMo STT(API)": {
"id": "mimo_stt",
"provider": "mimo",
"type": "mimo_stt_api",
"provider_type": "speech_to_text",
"enable": False,
"api_key": "",
"api_base": "https://api.xiaomimimo.com/v1",
"model": "mimo-v2-omni",
"mimo-stt-system-prompt": "You are a speech transcription assistant. Transcribe the spoken content from the audio exactly and return only the transcription text.",
"mimo-stt-user-prompt": "Please transcribe the content of the audio and return only the transcription text.",
"timeout": "20",
"proxy": "",
},
"Whisper(Local)": {
"provider": "openai",
"type": "openai_whisper_selfhost",
Expand Down Expand Up @@ -1461,6 +1475,23 @@ class ChatProviderTemplate(TypedDict):
"timeout": "20",
"proxy": "",
},
"MiMo TTS(API)": {
"id": "mimo_tts",
"type": "mimo_tts_api",
"provider": "mimo",
"provider_type": "text_to_speech",
"enable": False,
"api_key": "",
"api_base": "https://api.xiaomimimo.com/v1",
"model": "mimo-v2-tts",
"mimo-tts-voice": "mimo_default",
"mimo-tts-format": "wav",
"mimo-tts-style-prompt": "",
"mimo-tts-dialect": "",
"mimo-tts-seed-text": "Hello, MiMo, have you had lunch?",
"timeout": "20",
"proxy": "",
},
"Genie TTS": {
"id": "genie_tts",
"provider": "genie_tts",
Expand Down Expand Up @@ -2309,11 +2340,46 @@ class ChatProviderTemplate(TypedDict):
"type": "int",
"hint": "超时时间,单位为秒。",
},
"mimo-stt-system-prompt": {
"description": "系统提示词",
"type": "string",
"hint": "用于指导 MiMo STT 转录行为的 system prompt。",
},
"mimo-stt-user-prompt": {
"description": "用户提示词",
"type": "string",
"hint": "附加给 MiMo STT 的用户提示词,用于约束返回结果格式。",
},
"openai-tts-voice": {
"description": "voice",
"type": "string",
"hint": "OpenAI TTS 的声音。OpenAI 默认支持:'alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer'",
},
"mimo-tts-voice": {
"description": "音色",
"type": "string",
"hint": "MiMo TTS 的音色名称。可选值包括 'mimo_default'、'default_en'、'default_zh'。",
},
"mimo-tts-format": {
"description": "输出格式",
"type": "string",
"hint": "MiMo TTS 生成音频的格式。支持 'wav'、'mp3'、'pcm'。",
},
"mimo-tts-style-prompt": {
"description": "风格提示词",
"type": "string",
"hint": "用于控制生成语音的说话风格、语气或情绪,例如温柔、活泼、沉稳等。可留空。",
},
"mimo-tts-dialect": {
"description": "方言",
"type": "string",
"hint": "指定生成语音时使用的方言或口音,例如四川话、粤语口音等。可留空。",
},
"mimo-tts-seed-text": {
"description": "种子文本",
"type": "string",
"hint": "用于引导音色和说话方式的参考文本,会影响生成语音的表达风格。",
},
"fishaudio-tts-character": {
"description": "character",
"type": "string",
Expand Down
8 changes: 8 additions & 0 deletions astrbot/core/provider/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -387,6 +387,10 @@ def dynamic_import_provider(self, type: str) -> None:
from .sources.whisper_api_source import (
ProviderOpenAIWhisperAPI as ProviderOpenAIWhisperAPI,
)
case "mimo_stt_api":
from .sources.mimo_stt_api_source import (
ProviderMiMoSTTAPI as ProviderMiMoSTTAPI,
)
case "openai_whisper_selfhost":
from .sources.whisper_selfhosted_source import (
ProviderOpenAIWhisperSelfHost as ProviderOpenAIWhisperSelfHost,
Expand All @@ -399,6 +403,10 @@ def dynamic_import_provider(self, type: str) -> None:
from .sources.openai_tts_api_source import (
ProviderOpenAITTSAPI as ProviderOpenAITTSAPI,
)
case "mimo_tts_api":
from .sources.mimo_tts_api_source import (
ProviderMiMoTTSAPI as ProviderMiMoTTSAPI,
)
case "genie_tts":
from .sources.genie_tts import (
GenieTTSProvider as GenieTTSProvider,
Expand Down
129 changes: 129 additions & 0 deletions astrbot/core/provider/sources/mimo_api_common.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
import base64
import uuid
from pathlib import Path
from urllib.parse import urlparse

import httpx

from astrbot import logger
from astrbot.core.utils.astrbot_path import get_astrbot_temp_path
from astrbot.core.utils.io import download_file
from astrbot.core.utils.tencent_record_helper import (
convert_to_pcm_wav,
tencent_silk_to_wav,
)

# Default endpoint and model names used when a provider config omits them.
DEFAULT_MIMO_API_BASE = "https://api.xiaomimimo.com/v1"
DEFAULT_MIMO_TTS_MODEL = "mimo-v2-tts"
DEFAULT_MIMO_STT_MODEL = "mimo-v2-omni"

# Default text-to-speech voice and seed text.
DEFAULT_MIMO_TTS_VOICE = "mimo_default"
DEFAULT_MIMO_TTS_SEED_TEXT = "Hello, MiMo, have you had lunch?"

# Default prompts sent with speech-to-text requests.
DEFAULT_MIMO_STT_SYSTEM_PROMPT = (
    "You are a speech transcription assistant. Transcribe the spoken content "
    "from the audio exactly and return only the transcription text."
)
DEFAULT_MIMO_STT_USER_PROMPT = (
    "Please transcribe the content of the audio "
    "and return only the transcription text."
)


class MiMoAPIError(Exception):
    """Raised when a MiMo API request fails or returns an unusable response."""


def normalize_timeout(timeout: int | str | None) -> int | None:
if timeout in (None, ""):
return None
if isinstance(timeout, str):
return int(timeout)
return timeout


def build_headers(api_key: str) -> dict[str, str]:
    """Build the JSON request headers, adding a bearer token when a key is given.

    Args:
        api_key: API key to send; when empty no ``Authorization`` header is set.

    Returns:
        A new header dictionary.
    """
    auth_header = {"Authorization": f"Bearer {api_key}"} if api_key else {}
    return {"Content-Type": "application/json", **auth_header}


def get_temp_dir() -> Path:
    """Return the AstrBot temp directory as a ``Path``, creating it if missing."""
    directory = Path(get_astrbot_temp_path())
    # Create intermediate directories too; an already existing directory is fine.
    directory.mkdir(exist_ok=True, parents=True)
    return directory


def create_http_client(timeout: int | None, proxy: str) -> httpx.AsyncClient:
    """Create the async HTTP client used for MiMo API calls.

    Args:
        timeout: Timeout value handed to ``httpx.AsyncClient`` unchanged.
        proxy: Proxy URL; when empty the client is created without a proxy.

    Returns:
        A new ``httpx.AsyncClient`` that follows redirects.
    """
    if not proxy:
        return httpx.AsyncClient(timeout=timeout, follow_redirects=True)

    logger.info("[MiMo API] Using proxy: %s", proxy)
    return httpx.AsyncClient(timeout=timeout, follow_redirects=True, proxy=proxy)


def build_api_url(api_base: str) -> str:
    """Return the chat-completions URL for the given API base.

    Trailing slashes are removed first. If the base already ends with
    ``/chat/completions`` it is returned as-is; otherwise that path is appended.
    """
    endpoint = "/chat/completions"
    base = api_base.rstrip("/")
    return base if base.endswith(endpoint) else base + endpoint


async def _detect_audio_format(file_path: Path) -> str | None:
silk_header = b"SILK"
amr_header = b"#!AMR"

try:
with file_path.open("rb") as file:
file_header = file.read(8)
except FileNotFoundError:
return None

if silk_header in file_header:
return "silk"
if amr_header in file_header:
return "amr"
return None


async def prepare_audio_input(audio_source: str) -> tuple[str, list[Path]]:
    """Load an audio source and return it base64-encoded for the MiMo API.

    Remote ``http(s)`` sources are downloaded into the temp directory first.
    Files with an ``.amr``/``.silk`` suffix, and anything fetched from the
    Tencent multimedia host, are sniffed and converted to WAV when they turn
    out to be SILK or AMR.

    Args:
        audio_source: A local file path or an ``http://``/``https://`` URL.

    Returns:
        A ``(encoded_audio, cleanup_paths)`` tuple: the base64 text of the
        final audio file and the temporary files the caller must delete once
        the request is done (see ``cleanup_files``).

    Raises:
        FileNotFoundError: If the (downloaded or local) source file is missing.
        Exception: Any download, conversion or read error is re-raised after
            the temporary files created so far have been removed.
    """
    cleanup_paths: list[Path] = []
    source_path = Path(audio_source)
    is_remote = audio_source.startswith(("http://", "https://"))
    is_tencent = is_remote and "multimedia.nt.qq.com.cn" in audio_source

    try:
        if is_remote:
            suffix = Path(urlparse(audio_source).path).suffix or ".input"
            download_path = (
                get_temp_dir() / f"mimo_audio_{uuid.uuid4().hex[:8]}{suffix}"
            )
            # Register before downloading so a partially written file is also
            # removed when the download fails midway.
            cleanup_paths.append(download_path)
            await download_file(audio_source, str(download_path))
            source_path = download_path

        if not source_path.exists():
            raise FileNotFoundError(f"File does not exist: {source_path}")

        if is_tencent or source_path.suffix.lower() in {".amr", ".silk"}:
            file_format = await _detect_audio_format(source_path)
            if file_format in {"silk", "amr"}:
                converted_path = (
                    get_temp_dir() / f"mimo_audio_{uuid.uuid4().hex[:8]}.wav"
                )
                cleanup_paths.append(converted_path)
                if file_format == "silk":
                    logger.info("Converting silk file to wav for MiMo STT...")
                    await tencent_silk_to_wav(str(source_path), str(converted_path))
                else:
                    logger.info("Converting amr file to wav for MiMo STT...")
                    await convert_to_pcm_wav(str(source_path), str(converted_path))
                source_path = converted_path

        encoded_audio = base64.b64encode(source_path.read_bytes()).decode("utf-8")
    except BaseException:
        # On failure the caller never receives cleanup_paths, so the temporary
        # files must be removed here or they would be leaked. BaseException is
        # used so task cancellation is covered as well; the error is re-raised.
        cleanup_files(cleanup_paths)
        raise

    return encoded_audio, cleanup_paths


def cleanup_files(paths: list[Path]) -> None:
    """Delete the given temporary files on a best-effort basis.

    Paths that no longer exist are ignored. A failure to delete one file is
    logged as a warning and does not stop the remaining files being processed.
    """
    for temp_file in paths:
        try:
            temp_file.unlink(missing_ok=True)
        except Exception as error:
            logger.warning(
                "Failed to remove temporary MiMo file %s: %s", temp_file, error
            )
100 changes: 100 additions & 0 deletions astrbot/core/provider/sources/mimo_stt_api_source.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
from ..entities import ProviderType
from ..provider import STTProvider
from ..register import register_provider_adapter
from .mimo_api_common import (
DEFAULT_MIMO_API_BASE,
DEFAULT_MIMO_STT_MODEL,
DEFAULT_MIMO_STT_SYSTEM_PROMPT,
DEFAULT_MIMO_STT_USER_PROMPT,
MiMoAPIError,
build_api_url,
build_headers,
cleanup_files,
create_http_client,
normalize_timeout,
prepare_audio_input,
)


@register_provider_adapter(
    "mimo_stt_api",
    "MiMo STT API",
    provider_type=ProviderType.SPEECH_TO_TEXT,
)
class ProviderMiMoSTTAPI(STTProvider):
    """Speech-to-text provider that transcribes audio via the MiMo chat API.

    The audio is sent base64-encoded as an ``input_audio`` content part of a
    chat-completions request, together with configurable system and user
    prompts; the assistant message text is returned as the transcription.
    """

    def __init__(
        self,
        provider_config: dict,
        provider_settings: dict,
    ) -> None:
        """Read connection and prompt settings from ``provider_config``.

        Missing keys fall back to the module defaults (API base, model,
        prompts) or to an empty API key / proxy and a 20 second timeout.
        """
        super().__init__(provider_config, provider_settings)
        self.chosen_api_key = provider_config.get("api_key", "")
        self.api_base = provider_config.get("api_base", DEFAULT_MIMO_API_BASE)
        self.proxy = provider_config.get("proxy", "")
        self.timeout = normalize_timeout(provider_config.get("timeout", 20))
        self.system_prompt = provider_config.get(
            "mimo-stt-system-prompt",
            DEFAULT_MIMO_STT_SYSTEM_PROMPT,
        )
        self.user_prompt = provider_config.get(
            "mimo-stt-user-prompt",
            DEFAULT_MIMO_STT_USER_PROMPT,
        )
        self.set_model(provider_config.get("model", DEFAULT_MIMO_STT_MODEL))
        self.client = create_http_client(self.timeout, self.proxy)

    def _build_payload(self, audio_base64: str) -> dict:
        """Build the chat-completions request body for one audio clip."""
        return {
            "model": self.model_name,
            "messages": [
                {
                    "role": "system",
                    "content": self.system_prompt,
                },
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "input_audio",
                            "input_audio": {
                                "data": audio_base64,
                            },
                        },
                        {
                            "type": "text",
                            "text": self.user_prompt,
                        },
                    ],
                },
            ],
            "max_completion_tokens": 1024,
        }

    @staticmethod
    def _extract_transcription(data: object) -> str:
        """Pull the transcription text out of a decoded API response.

        Every level is type-checked so that a null or unexpectedly shaped
        ``choices`` / ``message`` value results in ``MiMoAPIError`` rather
        than an ``AttributeError`` or ``TypeError``.
        """
        choices = data.get("choices") if isinstance(data, dict) else None
        first_choice = choices[0] if isinstance(choices, list) and choices else None
        message = (
            first_choice.get("message") if isinstance(first_choice, dict) else None
        )
        content = message.get("content") if isinstance(message, dict) else None
        if not isinstance(content, str) or not content.strip():
            raise MiMoAPIError(f"MiMo STT API returned empty transcription: {data}")
        return content.strip()

    async def get_text(self, audio_url: str) -> str:
        """Transcribe the audio at ``audio_url`` and return the text.

        Args:
            audio_url: Local path or ``http(s)`` URL of the audio to transcribe.

        Returns:
            The transcription with surrounding whitespace removed.

        Raises:
            MiMoAPIError: If the API answers with an HTTP error status, a body
                that is not valid JSON, or no usable transcription text.
        """
        audio_base64, cleanup_paths = await prepare_audio_input(audio_url)

        try:
            response = await self.client.post(
                build_api_url(self.api_base),
                headers=build_headers(self.chosen_api_key),
                json=self._build_payload(audio_base64),
            )
            try:
                response.raise_for_status()
            except Exception as exc:
                error_text = response.text[:1024]
                raise MiMoAPIError(
                    f"MiMo STT API request failed: HTTP {response.status_code}, response: {error_text}"
                ) from exc

            try:
                data = response.json()
            except ValueError as exc:
                # JSON decode errors are ValueError subclasses; report them as
                # an API error instead of leaking a raw decoding exception.
                raise MiMoAPIError(
                    f"MiMo STT API returned invalid JSON: {response.text[:1024]}"
                ) from exc

            return self._extract_transcription(data)
        finally:
            # Temporary download/conversion files are removed whether or not
            # the request succeeded.
            cleanup_files(cleanup_paths)

    async def terminate(self) -> None:
        """Close the underlying HTTP client, if one exists."""
        if self.client:
            await self.client.aclose()
Loading