diff --git a/Cargo.lock b/Cargo.lock
index 027a6432e..1846f49df 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -621,6 +621,17 @@ version = "1.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0"
+[[package]]
+name = "audiopus_sys"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "62314a1546a2064e033665d658e88c620a62904be945f8147e6b16c3db9f8651"
+dependencies = [
+ "cmake",
+ "log",
+ "pkg-config",
+]
+
[[package]]
name = "autocfg"
version = "1.5.0"
@@ -776,6 +787,26 @@ dependencies = [
"num-traits",
]
+[[package]]
+name = "bindgen"
+version = "0.71.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5f58bf3d7db68cfbac37cfc485a8d711e87e064c3d0fe0435b92f7a407f9d6b3"
+dependencies = [
+ "bitflags 2.10.0",
+ "cexpr",
+ "clang-sys",
+ "itertools 0.13.0",
+ "log",
+ "prettyplease",
+ "proc-macro2",
+ "quote",
+ "regex",
+ "rustc-hash",
+ "shlex",
+ "syn 2.0.114",
+]
+
[[package]]
name = "bit_field"
version = "0.10.3"
@@ -1105,6 +1136,15 @@ version = "0.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4f4c707c6a209cbe82d10abd08e1ea8995e9ea937d2550646e02798948992be0"
+[[package]]
+name = "cexpr"
+version = "0.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766"
+dependencies = [
+ "nom 7.1.3",
+]
+
[[package]]
name = "cff-parser"
version = "0.1.0"
@@ -1223,6 +1263,17 @@ dependencies = [
"inout",
]
+[[package]]
+name = "clang-sys"
+version = "1.8.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4"
+dependencies = [
+ "glob",
+ "libc",
+ "libloading",
+]
+
[[package]]
name = "clap"
version = "4.5.58"
@@ -1377,6 +1428,19 @@ dependencies = [
"windows-sys 0.59.0",
]
+[[package]]
+name = "console"
+version = "0.16.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "03e45a4a8926227e4197636ba97a9fc9b00477e9f4bd711395687c5f0734bec4"
+dependencies = [
+ "encode_unicode",
+ "libc",
+ "once_cell",
+ "unicode-width",
+ "windows-sys 0.61.2",
+]
+
[[package]]
name = "const-oid"
version = "0.9.6"
@@ -1433,6 +1497,35 @@ dependencies = [
"unicode-segmentation",
]
+[[package]]
+name = "cookie"
+version = "0.18.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4ddef33a339a91ea89fb53151bd0a4689cfce27055c291dfa69945475d22c747"
+dependencies = [
+ "percent-encoding",
+ "time",
+ "version_check",
+]
+
+[[package]]
+name = "cookie_store"
+version = "0.22.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "15b2c103cf610ec6cae3da84a766285b42fd16aad564758459e6ecf128c75206"
+dependencies = [
+ "cookie",
+ "document-features",
+ "idna",
+ "indexmap 2.13.0",
+ "log",
+ "serde",
+ "serde_derive",
+ "serde_json",
+ "time",
+ "url",
+]
+
[[package]]
name = "core-foundation"
version = "0.9.4"
@@ -2481,7 +2574,7 @@ version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "658bce805d770f407bc62102fca7c2c64ceef2fbcb2b8bd19d2765ce093980de"
dependencies = [
- "console",
+ "console 0.15.11",
"shell-words",
"tempfile",
"thiserror 1.0.69",
@@ -2553,6 +2646,15 @@ dependencies = [
"const-random",
]
+[[package]]
+name = "document-features"
+version = "0.2.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d4b8a88685455ed29a21542a33abd9cb6510b6b129abadabdcef0f4c55bc8f61"
+dependencies = [
+ "litrs",
+]
+
[[package]]
name = "dotenvy"
version = "0.15.7"
@@ -2793,6 +2895,12 @@ dependencies = [
"zune-inflate",
]
+[[package]]
+name = "extended"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "af9673d8203fcb076b19dfd17e38b3d4ae9f44959416ea532ce72415a6020365"
+
[[package]]
name = "fast-float2"
version = "0.2.3"
@@ -2812,7 +2920,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "04c269a76bfc6cea69553b7d040acb16c793119cebd97c756d21e08d0f075ff8"
dependencies = [
"anyhow",
- "hf-hub",
+ "hf-hub 0.4.3",
"image",
"ndarray",
"ort",
@@ -3470,7 +3578,7 @@ checksum = "629d8f3bbeda9d148036d6b0de0a3ab947abd08ce90626327fc3547a49d59d97"
dependencies = [
"dirs",
"http",
- "indicatif",
+ "indicatif 0.17.11",
"libc",
"log",
"native-tls",
@@ -3479,10 +3587,34 @@ dependencies = [
"serde",
"serde_json",
"thiserror 2.0.18",
- "ureq",
+ "ureq 2.12.1",
"windows-sys 0.60.2",
]
+[[package]]
+name = "hf-hub"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "aef3982638978efa195ff11b305f51f1f22f4f0a6cabee7af79b383ebee6a213"
+dependencies = [
+ "dirs",
+ "futures",
+ "http",
+ "indicatif 0.18.4",
+ "libc",
+ "log",
+ "native-tls",
+ "num_cpus",
+ "rand 0.9.2",
+ "reqwest",
+ "serde",
+ "serde_json",
+ "thiserror 2.0.18",
+ "tokio",
+ "ureq 3.2.0",
+ "windows-sys 0.61.2",
+]
+
[[package]]
name = "hkdf"
version = "0.12.4"
@@ -3986,13 +4118,26 @@ version = "0.17.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "183b3088984b400f4cfac3620d5e076c84da5364016b4f49473de574b2586235"
dependencies = [
- "console",
+ "console 0.15.11",
"number_prefix",
"portable-atomic",
"unicode-width",
"web-time",
]
+[[package]]
+name = "indicatif"
+version = "0.18.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "25470f23803092da7d239834776d653104d551bc4d7eacaf31e6837854b8e9eb"
+dependencies = [
+ "console 0.16.2",
+ "portable-atomic",
+ "unicode-width",
+ "unit-prefix",
+ "web-time",
+]
+
[[package]]
name = "indoc"
version = "2.0.7"
@@ -4915,6 +5060,16 @@ dependencies = [
"cc",
]
+[[package]]
+name = "libloading"
+version = "0.8.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d7c4b02199fee7c5d21a5ae7d8cfa79a6ef5bb2fc834d6e9058e89c825efdc55"
+dependencies = [
+ "cfg-if",
+ "windows-link 0.2.1",
+]
+
[[package]]
name = "libm"
version = "0.2.16"
@@ -4961,6 +5116,12 @@ version = "0.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6373607a59f0be73a39b6fe456b8192fcc3585f602af20751600e974dd455e77"
+[[package]]
+name = "litrs"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "11d3d7f243d5c5a8b9bb5d6dd2b1602c0cb0b9db1621bafc7ed66e35ff9fe092"
+
[[package]]
name = "lock_api"
version = "0.4.14"
@@ -5650,6 +5811,15 @@ dependencies = [
"web-time",
]
+[[package]]
+name = "ogg"
+version = "0.9.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fdab8dcd8d4052eaacaf8fb07a3ccd9a6e26efadb42878a413c68fc4af1dee2b"
+dependencies = [
+ "byteorder",
+]
+
[[package]]
name = "once_cell"
version = "1.21.3"
@@ -5846,6 +6016,15 @@ version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d"
+[[package]]
+name = "opus"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4d3809943dff6fbad5f0484449ea26bdb9cb7d8efdf26ed50d3c7f227f69eb5c"
+dependencies = [
+ "audiopus_sys",
+]
+
[[package]]
name = "ordered-float"
version = "5.1.0"
@@ -5886,7 +6065,7 @@ dependencies = [
"pkg-config",
"sha2",
"tar",
- "ureq",
+ "ureq 2.12.1",
]
[[package]]
@@ -7873,6 +8052,7 @@ dependencies = [
"dirs",
"fastembed",
"futures",
+ "hf-hub 0.5.0",
"ignore",
"indoc",
"lance-index",
@@ -7881,11 +8061,13 @@ dependencies = [
"mime_guess",
"minijinja",
"notify",
+ "ogg",
"open",
"opentelemetry",
"opentelemetry-otlp",
"opentelemetry-semantic-conventions",
"opentelemetry_sdk",
+ "opus",
"pdf-extract",
"pin-project",
"prometheus",
@@ -7904,6 +8086,7 @@ dependencies = [
"sha2",
"slack-morphism",
"sqlx",
+ "symphonia",
"teloxide",
"tempfile",
"thiserror 2.0.18",
@@ -7920,6 +8103,7 @@ dependencies = [
"twitch-irc",
"urlencoding",
"uuid",
+ "whisper-rs",
"zip",
]
@@ -8271,6 +8455,178 @@ version = "2.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292"
+[[package]]
+name = "symphonia"
+version = "0.5.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5773a4c030a19d9bfaa090f49746ff35c75dfddfa700df7a5939d5e076a57039"
+dependencies = [
+ "lazy_static",
+ "symphonia-bundle-flac",
+ "symphonia-bundle-mp3",
+ "symphonia-codec-aac",
+ "symphonia-codec-adpcm",
+ "symphonia-codec-pcm",
+ "symphonia-codec-vorbis",
+ "symphonia-core",
+ "symphonia-format-isomp4",
+ "symphonia-format-mkv",
+ "symphonia-format-ogg",
+ "symphonia-format-riff",
+ "symphonia-metadata",
+]
+
+[[package]]
+name = "symphonia-bundle-flac"
+version = "0.5.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c91565e180aea25d9b80a910c546802526ffd0072d0b8974e3ebe59b686c9976"
+dependencies = [
+ "log",
+ "symphonia-core",
+ "symphonia-metadata",
+ "symphonia-utils-xiph",
+]
+
+[[package]]
+name = "symphonia-bundle-mp3"
+version = "0.5.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4872dd6bb56bf5eac799e3e957aa1981086c3e613b27e0ac23b176054f7c57ed"
+dependencies = [
+ "lazy_static",
+ "log",
+ "symphonia-core",
+ "symphonia-metadata",
+]
+
+[[package]]
+name = "symphonia-codec-aac"
+version = "0.5.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4c263845aa86881416849c1729a54c7f55164f8b96111dba59de46849e73a790"
+dependencies = [
+ "lazy_static",
+ "log",
+ "symphonia-core",
+]
+
+[[package]]
+name = "symphonia-codec-adpcm"
+version = "0.5.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2dddc50e2bbea4cfe027441eece77c46b9f319748605ab8f3443350129ddd07f"
+dependencies = [
+ "log",
+ "symphonia-core",
+]
+
+[[package]]
+name = "symphonia-codec-pcm"
+version = "0.5.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4e89d716c01541ad3ebe7c91ce4c8d38a7cf266a3f7b2f090b108fb0cb031d95"
+dependencies = [
+ "log",
+ "symphonia-core",
+]
+
+[[package]]
+name = "symphonia-codec-vorbis"
+version = "0.5.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f025837c309cd69ffef572750b4a2257b59552c5399a5e49707cc5b1b85d1c73"
+dependencies = [
+ "log",
+ "symphonia-core",
+ "symphonia-utils-xiph",
+]
+
+[[package]]
+name = "symphonia-core"
+version = "0.5.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ea00cc4f79b7f6bb7ff87eddc065a1066f3a43fe1875979056672c9ef948c2af"
+dependencies = [
+ "arrayvec",
+ "bitflags 1.3.2",
+ "bytemuck",
+ "lazy_static",
+ "log",
+]
+
+[[package]]
+name = "symphonia-format-isomp4"
+version = "0.5.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "243739585d11f81daf8dac8d9f3d18cc7898f6c09a259675fc364b382c30e0a5"
+dependencies = [
+ "encoding_rs",
+ "log",
+ "symphonia-core",
+ "symphonia-metadata",
+ "symphonia-utils-xiph",
+]
+
+[[package]]
+name = "symphonia-format-mkv"
+version = "0.5.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "122d786d2c43a49beb6f397551b4a050d8229eaa54c7ddf9ee4b98899b8742d0"
+dependencies = [
+ "lazy_static",
+ "log",
+ "symphonia-core",
+ "symphonia-metadata",
+ "symphonia-utils-xiph",
+]
+
+[[package]]
+name = "symphonia-format-ogg"
+version = "0.5.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2b4955c67c1ed3aa8ae8428d04ca8397fbef6a19b2b051e73b5da8b1435639cb"
+dependencies = [
+ "log",
+ "symphonia-core",
+ "symphonia-metadata",
+ "symphonia-utils-xiph",
+]
+
+[[package]]
+name = "symphonia-format-riff"
+version = "0.5.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c2d7c3df0e7d94efb68401d81906eae73c02b40d5ec1a141962c592d0f11a96f"
+dependencies = [
+ "extended",
+ "log",
+ "symphonia-core",
+ "symphonia-metadata",
+]
+
+[[package]]
+name = "symphonia-metadata"
+version = "0.5.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "36306ff42b9ffe6e5afc99d49e121e0bd62fe79b9db7b9681d48e29fa19e6b16"
+dependencies = [
+ "encoding_rs",
+ "lazy_static",
+ "log",
+ "symphonia-core",
+]
+
+[[package]]
+name = "symphonia-utils-xiph"
+version = "0.5.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ee27c85ab799a338446b68eec77abf42e1a6f1bb490656e121c6e27bfbab9f16"
+dependencies = [
+ "symphonia-core",
+ "symphonia-metadata",
+]
+
[[package]]
name = "syn"
version = "1.0.109"
@@ -9386,6 +9742,12 @@ version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e"
+[[package]]
+name = "unit-prefix"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "81e544489bf3d8ef66c953931f56617f423cd4b5494be343d9b9d3dda037b9a3"
+
[[package]]
name = "universal-hash"
version = "0.5.1"
@@ -9422,6 +9784,42 @@ dependencies = [
"webpki-roots 0.26.11",
]
+[[package]]
+name = "ureq"
+version = "3.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fdc97a28575b85cfedf2a7e7d3cc64b3e11bd8ac766666318003abbacc7a21fc"
+dependencies = [
+ "base64 0.22.1",
+ "cookie_store",
+ "der",
+ "flate2",
+ "log",
+ "native-tls",
+ "percent-encoding",
+ "rustls 0.23.36",
+ "rustls-pki-types",
+ "serde",
+ "serde_json",
+ "socks",
+ "ureq-proto",
+ "utf-8",
+ "webpki-root-certs",
+ "webpki-roots 1.0.6",
+]
+
+[[package]]
+name = "ureq-proto"
+version = "0.5.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d81f9efa9df032be5934a46a068815a10a042b494b6a58cb0a1a97bb5467ed6f"
+dependencies = [
+ "base64 0.22.1",
+ "http",
+ "httparse",
+ "log",
+]
+
[[package]]
name = "url"
version = "2.5.8"
@@ -9681,6 +10079,15 @@ dependencies = [
"wasm-bindgen",
]
+[[package]]
+name = "webpki-root-certs"
+version = "1.0.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "804f18a4ac2676ffb4e8b5b5fa9ae38af06df08162314f96a68d2a363e21a8ca"
+dependencies = [
+ "rustls-pki-types",
+]
+
[[package]]
name = "webpki-roots"
version = "0.25.4"
@@ -9722,6 +10129,28 @@ dependencies = [
"winsafe",
]
+[[package]]
+name = "whisper-rs"
+version = "0.15.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "71ea5d2401f30f51d08126a2d133fee4c1955136519d7ac6cf6f5ac0a91e6bc8"
+dependencies = [
+ "libc",
+ "whisper-rs-sys",
+]
+
+[[package]]
+name = "whisper-rs-sys"
+version = "0.14.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b5e2a6e06e7ac7b8f53c53a5f50bb0bc823ba69b63ecd887339f807a5598bbd2"
+dependencies = [
+ "bindgen",
+ "cfg-if",
+ "cmake",
+ "fs_extra",
+]
+
[[package]]
name = "whoami"
version = "1.6.1"
diff --git a/Cargo.toml b/Cargo.toml
index fbd05795a..5c9f581fb 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -134,11 +134,17 @@ tempfile = "3"
# Prometheus metrics (optional, behind "metrics" feature)
prometheus = { version = "0.13", optional = true }
+whisper-rs = { version = "0.15", optional = true, features = ["vulkan"] }
+hf-hub = { version = "0.5", optional = true }
+symphonia = { version = "0.5", features = ["mp3", "aac", "flac", "ogg", "wav", "isomp4"], optional = true }
+ogg = { version = "0.9", optional = true }
+opus = { version = "0.3", optional = true }
pdf-extract = "0.10.0"
open = "5.3.3"
urlencoding = "2.1.3"
[features]
+stt-whisper = ["dep:whisper-rs", "dep:hf-hub", "dep:symphonia", "dep:ogg", "dep:opus"]
metrics = ["dep:prometheus"]
[lints.clippy]
diff --git a/README.md b/README.md
index 82c98caf9..5bd52cf01 100644
--- a/README.md
+++ b/README.md
@@ -193,6 +193,30 @@ channel = "my-provider/my-model"
Additional built-in providers include **NVIDIA**, **MiniMax**, **Moonshot AI (Kimi)**, and **Z.AI Coding Plan** — configure with `nvidia_key`, `minimax_key`, `moonshot_key`, or `zai_coding_plan_key` in `[llm]`.
+### Voice Transcription
+
+Audio attachments (voice messages, audio files) are transcribed before being passed to the channel. Set `routing.voice` to choose the backend:
+
+**Provider-based** — route through any configured LLM provider that supports audio input:
+
+```toml
+[defaults.routing]
+voice = "openai/whisper-1"
+```
+
+**Local Whisper** (requires building with `--features stt-whisper`) — runs inference locally via [whisper-rs](https://codeberg.org/tazz4843/whisper-rs); no API call is needed:
+
+```toml
+[defaults.routing]
+voice = "whisper-local://small"
+```
+
+The model is downloaded automatically from [`ggerganov/whisper.cpp`](https://huggingface.co/ggerganov/whisper.cpp) on first use and cached in the Hugging Face cache (`~/.cache/huggingface/hub` by default). Supported size names: `tiny`, `tiny.en`, `base`, `base.en`, `small`, `small.en`, `medium`, `medium.en`, `large` (an alias for `large-v3`), `large-v1`, `large-v2`, `large-v3`. An absolute path to a GGML model file also works, e.g. `voice = "whisper-local:///path/to/ggml-small.bin"`.
+
+GPU acceleration via Vulkan is enabled automatically when a compatible device is detected. The first model loaded stays cached for the lifetime of the process; changing `routing.voice` to a different local model only takes effect after a restart.
+
+Ogg/Opus audio (Telegram voice messages) is decoded natively. All other formats are handled via symphonia.
+
### Skills
Extensible skill system integrated with [skills.sh](https://skills.sh):
diff --git a/src/agent/channel.rs b/src/agent/channel.rs
index f6e419e3b..5bb76de30 100644
--- a/src/agent/channel.rs
+++ b/src/agent/channel.rs
@@ -1841,6 +1841,32 @@ async fn transcribe_audio_attachment(
));
}
+ // Local Whisper backend — bypass the LLM provider path entirely.
+ #[cfg(feature = "stt-whisper")]
+ if let Some(model_spec) = voice_model.strip_prefix("whisper-local://") {
+ let transcript = match crate::stt::transcribe(model_spec, &bytes).await {
+ Ok(text) if text.is_empty() => {
+ tracing::warn!(filename = %attachment.filename, "local Whisper returned empty transcript");
+ return UserContent::text(format!(
+ "[Audio transcription returned empty text for {}]",
+ attachment.filename
+ ));
+ }
+ Ok(text) => text,
+ Err(error) => {
+ tracing::warn!(%error, filename = %attachment.filename, "local Whisper transcription failed");
+ return UserContent::text(format!(
+ "[Audio transcription failed for {}: {}]",
+ attachment.filename, error
+ ));
+ }
+ };
+ return UserContent::text(format!(
+ "\n{}\n",
+ attachment.filename, attachment.mime_type, transcript
+ ));
+ }
+
let (provider_id, model_name) = match deps.llm_manager.resolve_model(voice_model) {
Ok(parts) => parts,
Err(error) => {
diff --git a/src/lib.rs b/src/lib.rs
index ed80aed32..be4eb274d 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -22,6 +22,8 @@ pub mod skills;
#[cfg(feature = "metrics")]
pub mod telemetry;
pub mod tools;
+#[cfg(feature = "stt-whisper")]
+pub mod stt;
pub mod update;
pub use error::{Error, Result};
diff --git a/src/stt.rs b/src/stt.rs
new file mode 100644
index 000000000..d07afa400
--- /dev/null
+++ b/src/stt.rs
@@ -0,0 +1,348 @@
+//! Local Whisper speech-to-text via whisper-rs.
+//!
+//! Only compiled when the `stt-whisper` feature is enabled.
+//! Exposed as a single async `transcribe` function that lazily loads and caches
+//! the model context for the lifetime of the process.
+
+#[cfg(feature = "stt-whisper")]
+pub use local::transcribe;
+
+#[cfg(feature = "stt-whisper")]
+mod local {
+ use std::sync::OnceLock;
+
+ use hf_hub::api::sync::Api;
+ use whisper_rs::{FullParams, SamplingStrategy, WhisperContext, WhisperContextParameters};
+
+    /// Known model size names and the GGML file on `ggerganov/whisper.cpp` each resolves to.
+    const KNOWN_SIZES: &[(&str, &str)] = &[
+        ("tiny", "ggml-tiny.bin"),
+        ("tiny.en", "ggml-tiny.en.bin"),
+        ("base", "ggml-base.bin"),
+        ("base.en", "ggml-base.en.bin"),
+        ("small", "ggml-small.bin"),
+        ("small.en", "ggml-small.en.bin"),
+        ("medium", "ggml-medium.bin"),
+        ("medium.en", "ggml-medium.en.bin"),
+        ("large", "ggml-large-v3.bin"), // alias: same file as "large-v3"
+        ("large-v1", "ggml-large-v1.bin"),
+        ("large-v2", "ggml-large-v2.bin"),
+        ("large-v3", "ggml-large-v3.bin"),
+    ];
+
+    /// Process-wide cache: the model spec that was loaded and its `WhisperContext`.
+    ///
+    /// Set at most once. A later change of `routing.voice` is not picked up — the
+    /// already-loaded model stays in use and a restart is required to switch models.
+    static CONTEXT: OnceLock<(String, WhisperContext)> = OnceLock::new();
+
+    #[derive(Debug, thiserror::Error)]
+    pub enum WhisperError {
+        #[error("model not found and could not be downloaded: {0}")]
+        ModelNotFound(String), // model file missing on disk, or unknown size name
+        #[error("hf-hub error: {0}")]
+        HfHub(String), // hf-hub API setup or model fetch failed
+        #[error("failed to load whisper model: {0}")]
+        Load(String), // creating the `WhisperContext` from the model file failed
+        #[error("failed to create whisper state: {0}")]
+        State(String), // `create_state` on the loaded context failed
+        #[error("transcription failed: {0}")]
+        Transcription(String), // inference failed, or the blocking task did not complete
+        #[error("audio decode error: {0}")]
+        Decode(String), // input audio could not be probed or decoded
+    }
+
+    /// Transcribe raw audio bytes with the local Whisper model.
+    ///
+    /// `model_spec` is the part after `whisper-local://`: a known size name (`small`,
+    /// `medium`, `large`, …), fetched via hf-hub on first use, or an absolute path to a
+    /// GGML model file. Returns the transcript (possibly empty), or a [`WhisperError`]
+    /// if model loading, audio decoding, inference, or the blocking task itself fails.
+    pub async fn transcribe(model_spec: &str, audio: &[u8]) -> Result<String, WhisperError> {
+        // The blocking closure must be `'static`, so both inputs are copied into it.
+        let (model_spec, audio) = (model_spec.to_owned(), audio.to_vec());
+
+        // Whisper inference is CPU-bound and blocking — run on a thread pool.
+        tokio::task::spawn_blocking(move || transcribe_blocking(&model_spec, &audio))
+            .await
+            .map_err(|e| WhisperError::Transcription(e.to_string()))?
+    }
+
+    /// Run the whole transcription synchronously on the calling thread.
+    ///
+    /// Decodes `audio` to 16 kHz mono samples, runs Whisper with greedy sampling and
+    /// automatic language detection, and joins the non-empty segment texts with
+    /// single spaces.
+    fn transcribe_blocking(model_spec: &str, audio: &[u8]) -> Result<String, WhisperError> {
+        // Decode first so undecodable audio fails before any model is loaded for it.
+        let samples = decode_to_f32(audio)?;
+
+        let ctx = get_or_load_context(model_spec)?;
+        let mut state = ctx
+            .create_state()
+            .map_err(|e| WhisperError::State(e.to_string()))?;
+
+        let mut params = FullParams::new(SamplingStrategy::Greedy { best_of: 1 });
+        params.set_language(Some("auto"));
+        params.set_print_progress(false);
+        params.set_print_realtime(false);
+        params.set_print_timestamps(false);
+
+        state
+            .full(params, &samples)
+            .map_err(|e| WhisperError::Transcription(e.to_string()))?;
+
+        // Segments that are missing or whose text cannot be read as a string are
+        // skipped instead of failing the whole transcript.
+        let parts: Vec<String> = (0..state.full_n_segments())
+            .filter_map(|index| state.get_segment(index))
+            .filter_map(|segment| segment.to_str().ok().map(|text| text.trim().to_owned()))
+            .filter(|text| !text.is_empty())
+            .collect();
+
+        Ok(parts.join(" "))
+    }
+
+    /// Return the process-wide context, loading the model on the first call.
+    ///
+    /// Loading is serialized so concurrent first calls do not each load the model.
+    fn get_or_load_context(model_spec: &str) -> Result<&'static WhisperContext, WhisperError> {
+        static LOAD_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(());
+        // A poisoned lock only means an earlier load panicked; the guard is still usable.
+        let _guard = LOAD_LOCK.lock().unwrap_or_else(|poisoned| poisoned.into_inner());
+        if let Some((_, ctx)) = CONTEXT.get() {
+            return Ok(ctx);
+        }
+
+        let model_path = resolve_model_path(model_spec)?;
+        tracing::info!(model_path = %model_path, "loading local Whisper model");
+        let ctx = WhisperContext::new_with_params(&model_path, WhisperContextParameters::default())
+            .map_err(|e| WhisperError::Load(e.to_string()))?;
+        // With the lock held the cell is still empty here, so this stores `ctx`.
+        let cached = CONTEXT.get_or_init(|| (model_spec.to_owned(), ctx));
+        tracing::info!(model_path = %model_path, "Whisper model loaded and cached");
+        Ok(&cached.1)
+    }
+
+    /// Resolve a model spec to a path on disk, downloading via hf-hub if needed.
+    ///
+    /// An absolute path must name an existing file and is returned unchanged. Any
+    /// other spec must be one of the `KNOWN_SIZES` names; its GGML file is fetched
+    /// from the `ggerganov/whisper.cpp` repository through hf-hub.
+    fn resolve_model_path(spec: &str) -> Result<String, WhisperError> {
+        // `Path::is_absolute` (rather than a leading '/') also accepts Windows paths.
+        let local_path = std::path::Path::new(spec);
+        if local_path.is_absolute() {
+            return if local_path.is_file() {
+                Ok(spec.to_owned())
+            } else {
+                Err(WhisperError::ModelNotFound(format!(
+                    "model file not found: {spec}"
+                )))
+            };
+        }
+
+        // Known size name — fetch via hf-hub (uses HF_HOME cache, downloads if missing).
+        let Some(&(_, filename)) = KNOWN_SIZES.iter().find(|(name, _)| *name == spec) else {
+            let known: Vec<&str> = KNOWN_SIZES.iter().map(|(name, _)| *name).collect();
+            return Err(WhisperError::ModelNotFound(format!(
+                "unknown model size '{spec}'; use one of: {}",
+                known.join(", ")
+            )));
+        };
+
+        tracing::info!(model = %spec, filename = %filename, "fetching Whisper model via hf-hub");
+
+        let api = Api::new().map_err(|e| WhisperError::HfHub(e.to_string()))?;
+        let path = api
+            .model("ggerganov/whisper.cpp".to_owned())
+            .get(filename)
+            .map_err(|e| WhisperError::HfHub(e.to_string()))?;
+
+        // The result is a `String`, so non-UTF-8 path components are replaced lossily.
+        Ok(path.to_string_lossy().to_string())
+    }
+
+    /// Decode arbitrary audio bytes to 16 kHz mono f32 samples for Whisper.
+    ///
+    /// Ogg/Opus (Telegram voice messages) is handled directly via the `ogg` +
+    /// `opus` crates. Everything else is probed and decoded with symphonia, mixed
+    /// down to mono by averaging the channels, and resampled to 16 kHz. Fails with
+    /// [`WhisperError::Decode`] when probing, track selection or the stream itself
+    /// fails; individual malformed packets are skipped with a warning.
+    fn decode_to_f32(audio: &[u8]) -> Result<Vec<f32>, WhisperError> {
+        use symphonia::core::audio::{AudioBuffer, Signal as _};
+        use symphonia::core::codecs::{DecoderOptions, CODEC_TYPE_NULL};
+        use symphonia::core::errors::Error as SymphoniaError;
+        use symphonia::core::formats::FormatOptions;
+        use symphonia::core::io::MediaSourceStream;
+        use symphonia::core::meta::MetadataOptions;
+        use symphonia::core::probe::Hint;
+
+        if is_ogg_opus(audio) {
+            return decode_ogg_opus(audio);
+        }
+
+        // The stream takes ownership of a boxed source, so hand it a copy of the bytes.
+        let cursor = std::io::Cursor::new(audio.to_vec());
+        let mss = MediaSourceStream::new(Box::new(cursor), Default::default());
+
+        let probed = symphonia::default::get_probe()
+            .format(
+                &Hint::new(),
+                mss,
+                &FormatOptions::default(),
+                &MetadataOptions::default(),
+            )
+            .map_err(|e| WhisperError::Decode(e.to_string()))?;
+
+        let mut format = probed.format;
+        let track = format
+            .tracks()
+            .iter()
+            .find(|t| t.codec_params.codec != CODEC_TYPE_NULL)
+            .ok_or_else(|| WhisperError::Decode("no audio track found".into()))?
+            .clone();
+        let track_id = track.id;
+
+        let mut decoder = symphonia::default::get_codecs()
+            .make(&track.codec_params, &DecoderOptions::default())
+            .map_err(|e| WhisperError::Decode(e.to_string()))?;
+
+        // Container metadata is only the fallback rate. Once a packet decodes, rate and
+        // channel count come from the decoded buffer, so no missing channel is indexed.
+        let mut sample_rate = track.codec_params.sample_rate.unwrap_or(16000);
+        let mut raw_samples: Vec<f32> = Vec::new();
+
+        loop {
+            let packet = match format.next_packet() {
+                Ok(p) => p,
+                // An I/O error is how the end of the stream shows up; stop reading.
+                Err(SymphoniaError::IoError(_) | SymphoniaError::ResetRequired) => break,
+                Err(e) => return Err(WhisperError::Decode(e.to_string())),
+            };
+            if packet.track_id() != track_id {
+                continue;
+            }
+
+            let decoded = match decoder.decode(&packet) {
+                Ok(buffer) => buffer,
+                // One malformed packet should not discard the rest of the audio.
+                Err(SymphoniaError::DecodeError(reason)) => {
+                    tracing::warn!(%reason, "skipping undecodable audio packet");
+                    continue;
+                }
+                Err(e) => return Err(WhisperError::Decode(e.to_string())),
+            };
+
+            let spec = *decoded.spec();
+            let channels = spec.channels.count();
+            if channels == 0 {
+                continue;
+            }
+            sample_rate = spec.rate;
+
+            // Convert whatever sample format the decoder produced to planar f32.
+            let mut f32_buf: AudioBuffer<f32> = AudioBuffer::new(decoded.capacity() as u64, spec);
+            decoded.convert(&mut f32_buf);
+
+            // Mix down to mono by averaging the channels of each frame.
+            let frames = f32_buf.frames();
+            raw_samples.reserve(frames);
+            for frame in 0..frames {
+                let sum: f32 = (0..channels).map(|ch| f32_buf.chan(ch)[frame]).sum();
+                raw_samples.push(sum / channels as f32);
+            }
+        }
+
+        Ok(resample(raw_samples, sample_rate, 16000))
+    }
+
+    /// Check if the audio is an Ogg container whose first packet is an Opus header.
+    fn is_ogg_opus(audio: &[u8]) -> bool {
+        // "OggS" capture pattern at offset 0; with a one-entry segment table the first
+        // packet starts at offset 28, where an Opus stream carries the "OpusHead" magic.
+        audio.starts_with(b"OggS") && audio.get(28..36) == Some(b"OpusHead".as_slice())
+    }
+
+    /// Decode Ogg/Opus audio to 16 kHz mono f32 samples.
+    ///
+    /// Reads channel count and pre-skip from the `OpusHead` packet, skips the packet
+    /// after it (`OpusTags`), decodes the remaining packets at 48 kHz, averages stereo
+    /// down to mono, drops the pre-skip samples and resamples to 16 kHz.
+    ///
+    /// Fails with [`WhisperError::Decode`] on a missing header packet, a channel count
+    /// other than 1 or 2, or a rejected packet; an unreadable page just ends decoding.
+    fn decode_ogg_opus(audio: &[u8]) -> Result<Vec<f32>, WhisperError> {
+        use ogg::reading::PacketReader;
+
+        // Decode at 48 kHz: that is the rate the header's pre-skip is counted in.
+        const OPUS_RATE: u32 = 48_000;
+        // Max Opus frame: 120ms at 48kHz = 5760 samples per channel.
+        const MAX_FRAME_SAMPLES: usize = 5760;
+
+        let mut reader = PacketReader::new(std::io::Cursor::new(audio));
+        // Best effort: a read error is treated like the end of the stream.
+        let mut packets = std::iter::from_fn(|| reader.read_packet().ok().flatten());
+
+        let head = packets
+            .next()
+            .ok_or_else(|| WhisperError::Decode("missing OpusHead packet".into()))?;
+        // OpusHead: magic (8 bytes), version, channel count, then pre-skip as u16 LE.
+        let (channels, pre_skip) = match head.data.as_slice() {
+            [b'O', b'p', b'u', b's', b'H', b'e', b'a', b'd', _, count, lo, hi, ..] => {
+                (usize::from(*count), usize::from(u16::from_le_bytes([*lo, *hi])))
+            }
+            // Not a well-formed header: fall back to mono with nothing to skip.
+            _ => (1, 0),
+        };
+        let layout = match channels {
+            1 => opus::Channels::Mono,
+            2 => opus::Channels::Stereo,
+            other => {
+                let message = format!("unsupported Opus channel count: {other}");
+                return Err(WhisperError::Decode(message));
+            }
+        };
+        let mut decoder = opus::Decoder::new(OPUS_RATE, layout)
+            .map_err(|e| WhisperError::Decode(e.to_string()))?;
+
+        // One interleaved scratch buffer, reused for every packet.
+        let mut pcm = vec![0f32; MAX_FRAME_SAMPLES * channels];
+        let mut samples: Vec<f32> = Vec::new();
+        // `skip(1)` drops the packet that follows the header (OpusTags).
+        for packet in packets.skip(1) {
+            let frames = decoder
+                .decode_float(&packet.data, &mut pcm, false)
+                .map_err(|e| WhisperError::Decode(e.to_string()))?;
+            // Mix down to mono by averaging the interleaved channels of each frame.
+            let mixed = pcm[..frames * channels]
+                .chunks_exact(channels)
+                .map(|frame| frame.iter().sum::<f32>() / channels as f32);
+            samples.extend(mixed);
+        }
+
+        // Discard the leading samples the header marks as pre-skip.
+        samples.drain(..pre_skip.min(samples.len()));
+        Ok(resample(samples, OPUS_RATE, 16000))
+    }
+
+    /// Simple linear resampler (good enough for speech; not for music).
+    ///
+    /// Equal or zero rates return `samples` unchanged; the last sample is held at the tail.
+    fn resample(samples: Vec<f32>, from_hz: u32, to_hz: u32) -> Vec<f32> {
+        if from_hz == to_hz || from_hz == 0 || to_hz == 0 {
+            return samples;
+        }
+        let ratio = f64::from(from_hz) / f64::from(to_hz);
+        let out_len = (samples.len() as f64 / ratio) as usize;
+        let sample_at = |idx: usize| samples.get(idx).or(samples.last()).copied().unwrap_or(0.0);
+        (0..out_len)
+            .map(|i| {
+                let pos = i as f64 * ratio;
+                let (idx, frac) = (pos as usize, pos.fract() as f32);
+                sample_at(idx) + frac * (sample_at(idx + 1) - sample_at(idx))
+            })
+            .collect()
+    }
+}