1,438 changes: 521 additions & 917 deletions Cargo.lock

Large diffs are not rendered by default.

6 changes: 6 additions & 0 deletions Cargo.toml
@@ -150,12 +150,18 @@ tempfile = "3"

# Prometheus metrics (optional, behind "metrics" feature)
prometheus = { version = "0.13", optional = true }
whisper-rs = { version = "0.15", optional = true, features = ["vulkan"] }
hf-hub = { version = "0.5", optional = true }
symphonia = { version = "0.5", features = ["mp3", "aac", "flac", "ogg", "wav", "isomp4"], optional = true }
ogg = { version = "0.9", optional = true }
opus = { version = "0.3", optional = true }
pdf-extract = "0.10.0"
open = "5.3.3"
urlencoding = "2.1.3"
moka = "0.12.13"

[features]
stt-whisper = ["dep:whisper-rs", "dep:hf-hub", "dep:symphonia", "dep:ogg", "dep:opus"]
metrics = ["dep:prometheus"]

[lints.clippy]
24 changes: 24 additions & 0 deletions README.md
@@ -197,6 +197,30 @@ channel = "my-provider/my-model"

Additional built-in providers include **Kilo Gateway**, **OpenCode Go**, **NVIDIA**, **MiniMax**, **Moonshot AI (Kimi)**, and **Z.AI Coding Plan** — configure with `kilo_key`, `opencode_go_key`, `nvidia_key`, `minimax_key`, `moonshot_key`, or `zai_coding_plan_key` in `[llm]`.

### Voice Transcription

Audio attachments (voice messages, audio files) are transcribed before being passed to the channel. Set `routing.voice` to choose the backend:

**Provider-based** — route through any configured LLM provider that supports audio input:

```toml
[defaults.routing]
voice = "openai/whisper-1"
```

**Local Whisper** (requires building with `--features stt-whisper`) — run inference locally via [whisper-rs](https://codeberg.org/tazz4843/whisper-rs); no API call needed:

```toml
[defaults.routing]
voice = "whisper-local://small"
```

The model is downloaded automatically from [`ggerganov/whisper.cpp`](https://huggingface.co/ggerganov/whisper.cpp) on first use and cached in `~/.cache/huggingface/hub`. Supported size names: `tiny`, `tiny.en`, `base`, `base.en`, `small`, `small.en`, `medium`, `medium.en`, `large`, `large-v1`, `large-v2`, `large-v3`. An absolute path to a GGML model file also works.
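The whisper.cpp repository publishes these models as `ggml-<size>.bin` files. A minimal sketch of how a resolver might distinguish a size name from a filesystem path — illustrative only, not the crate's actual code:

```rust
use std::path::Path;

/// Size names accepted by `whisper-local://`, as listed above.
const SIZES: &[&str] = &[
    "tiny", "tiny.en", "base", "base.en", "small", "small.en",
    "medium", "medium.en", "large", "large-v1", "large-v2", "large-v3",
];

/// Map a known size name to the GGML file name published in
/// ggerganov/whisper.cpp; treat an absolute path as a direct model file.
fn resolve_model(target: &str) -> Option<String> {
    if SIZES.contains(&target) {
        Some(format!("ggml-{target}.bin"))
    } else if Path::new(target).is_absolute() {
        Some(target.to_string())
    } else {
        None
    }
}

fn main() {
    assert_eq!(resolve_model("small").as_deref(), Some("ggml-small.bin"));
    assert_eq!(resolve_model("large-v3").as_deref(), Some("ggml-large-v3.bin"));
    assert_eq!(resolve_model("not-a-size"), None);
}
```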

GPU acceleration via Vulkan is enabled automatically when a compatible device is detected. The loaded model is cached for the process lifetime — restart to switch models.
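A process-lifetime cache of this kind can be built on `std::sync::OnceLock`: the first load wins and later calls return the same instance, which is why switching models requires a restart. A minimal sketch, with `Model` standing in as a placeholder for the real whisper-rs context type:

```rust
use std::sync::OnceLock;

/// Placeholder for the loaded whisper context (the real type lives in whisper-rs).
struct Model {
    name: String,
}

static MODEL: OnceLock<Model> = OnceLock::new();

/// Load the model exactly once; every later call returns the cached instance,
/// even if a different name is requested.
fn model(name: &str) -> &'static Model {
    MODEL.get_or_init(|| Model { name: name.to_string() })
}

fn main() {
    let a = model("small");
    let b = model("medium"); // ignored: the first load wins
    assert_eq!(a.name, "small");
    assert_eq!(b.name, "small");
    assert!(std::ptr::eq(a, b)); // same cached instance
}
```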

Ogg/Opus audio (Telegram voice messages) is decoded natively; all other formats are handled by symphonia.
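Whatever the container, whisper.cpp ultimately consumes 16 kHz mono f32 samples, while Opus typically decodes at 48 kHz. A naive sketch of the conversion — stereo downmix plus 3:1 decimation, with the caveat that a real pipeline would low-pass filter before decimating:

```rust
/// Convert interleaved stereo i16 at 48 kHz into mono f32 at 16 kHz.
/// Naive approach: average the two channels, keep every third frame
/// (48_000 / 16_000 = 3), and scale i16 into the [-1.0, 1.0] range.
fn to_whisper_input(stereo_48k: &[i16]) -> Vec<f32> {
    stereo_48k
        .chunks_exact(2)  // one L/R frame per chunk
        .step_by(3)       // 48 kHz -> 16 kHz decimation
        .map(|f| (f[0] as f32 + f[1] as f32) / 2.0 / 32768.0)
        .collect()
}

fn main() {
    // 6 stereo frames in -> 2 mono samples out.
    let input: Vec<i16> = vec![100, 100, 0, 0, 0, 0, -200, -200, 0, 0, 0, 0];
    let out = to_whisper_input(&input);
    assert_eq!(out.len(), 2);
    assert!((out[0] - 100.0 / 32768.0).abs() < 1e-6);
    assert!((out[1] + 200.0 / 32768.0).abs() < 1e-6);
}
```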

### Skills

Extensible skill system integrated with [skills.sh](https://skills.sh):
1 change: 1 addition & 0 deletions prompts/en/tools/transcribe_audio_description.md.j2
@@ -0,0 +1 @@
Transcribe an audio file to text using local speech-to-text. Provide the path to the audio file. Supports ogg, opus, mp3, flac, wav, and m4a formats. Use this instead of external whisper CLI tools.