From f7140455cbcd88698e96f770591b2a0674cfd87a Mon Sep 17 00:00:00 2001
From: Marenz <github@supradigital.org>
Date: Sat, 21 Feb 2026 11:54:15 +0100
Subject: [PATCH 1/3] Add local Whisper STT backend via whisper-rs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When routing.voice = "whisper-local://<spec>", audio attachments are
transcribed locally instead of via the LLM provider HTTP path.

<spec> is either:
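- A known size name (tiny, base, small, medium, large; the .en variants
  of tiny through medium; or large-v1/-v2/-v3 — plain "large" resolves
  to large-v3), fetched from ggerganov/whisper.cpp on HuggingFace via
  hf-hub, using the existing HF cache if already present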
- An absolute path to a GGML model file

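The WhisperContext is loaded once and cached in a OnceLock for the
process lifetime, so changing routing.voice to a different model only
takes effect after a restart. Audio decoding (Ogg/Vorbis, MP3, FLAC,
WAV, M4A/AAC) is handled by symphonia, followed by a mono mixdown and
linear resampling to 16 kHz f32. symphonia has no Opus decoder, so
Ogg/Opus input is not supported by this patch.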

All three deps (whisper-rs, hf-hub, symphonia) are optional behind the
stt-whisper feature flag.
---
 Cargo.lock           | 409 ++++++++++++++++++++++++++++++++++++++++++-
 Cargo.toml           |   4 +
 src/agent/channel.rs |  26 +++
 src/lib.rs           |   2 +
 src/stt.rs           | 275 +++++++++++++++++++++++++++++
 5 files changed, 710 insertions(+), 6 deletions(-)
 create mode 100644 src/stt.rs
diff --git a/Cargo.lock b/Cargo.lock
index 027a6432e..a41956207 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -776,6 +776,26 @@ dependencies = [
  "num-traits",
 ]
 
+[[package]]
+name = "bindgen"
+version = "0.71.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5f58bf3d7db68cfbac37cfc485a8d711e87e064c3d0fe0435b92f7a407f9d6b3"
+dependencies = [
+ "bitflags 2.10.0",
+ "cexpr",
+ "clang-sys",
+ "itertools 0.13.0",
+ "log",
+ "prettyplease",
+ "proc-macro2",
+ "quote",
+ "regex",
+ "rustc-hash",
+ "shlex",
+ "syn 2.0.114",
+]
+
 [[package]]
 name = "bit_field"
 version = "0.10.3"
@@ -1105,6 +1125,15 @@ version = "0.4.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4f4c707c6a209cbe82d10abd08e1ea8995e9ea937d2550646e02798948992be0"
 
+[[package]]
+name = "cexpr"
+version = "0.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766"
+dependencies = [
+ "nom 7.1.3",
+]
+
 [[package]]
 name = "cff-parser"
 version = "0.1.0"
@@ -1223,6 +1252,17 @@ dependencies = [
  "inout",
 ]
 
+[[package]]
+name = "clang-sys"
+version = "1.8.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4"
+dependencies = [
+ "glob",
+ "libc",
+ "libloading",
+]
+
 [[package]]
 name = "clap"
 version = "4.5.58"
@@ -1377,6 +1417,19 @@ dependencies = [
  "windows-sys 0.59.0",
 ]
 
+[[package]]
+name = "console"
+version = "0.16.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "03e45a4a8926227e4197636ba97a9fc9b00477e9f4bd711395687c5f0734bec4"
+dependencies = [
+ "encode_unicode",
+ "libc",
+ "once_cell",
+ "unicode-width",
+ "windows-sys 0.61.2",
+]
+
 [[package]]
 name = "const-oid"
 version = "0.9.6"
@@ -1433,6 +1486,35 @@ dependencies = [
  "unicode-segmentation",
 ]
 
+[[package]]
+name = "cookie"
+version = "0.18.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4ddef33a339a91ea89fb53151bd0a4689cfce27055c291dfa69945475d22c747"
+dependencies = [
+ "percent-encoding",
+ "time",
+ "version_check",
+]
+
+[[package]]
+name = "cookie_store"
+version = "0.22.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "15b2c103cf610ec6cae3da84a766285b42fd16aad564758459e6ecf128c75206"
+dependencies = [
+ "cookie",
+ "document-features",
+ "idna",
+ "indexmap 2.13.0",
+ "log",
+ "serde",
+ "serde_derive",
+ "serde_json",
+ "time",
+ "url",
+]
+
 [[package]]
 name = "core-foundation"
 version = "0.9.4"
@@ -2481,7 +2563,7 @@ version = "0.11.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "658bce805d770f407bc62102fca7c2c64ceef2fbcb2b8bd19d2765ce093980de"
 dependencies = [
- "console",
+ "console 0.15.11",
  "shell-words",
  "tempfile",
  "thiserror 1.0.69",
@@ -2553,6 +2635,15 @@ dependencies = [
  "const-random",
 ]
 
+[[package]]
+name = "document-features"
+version = "0.2.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d4b8a88685455ed29a21542a33abd9cb6510b6b129abadabdcef0f4c55bc8f61"
+dependencies = [
+ "litrs",
+]
+
 [[package]]
 name = "dotenvy"
 version = "0.15.7"
@@ -2793,6 +2884,12 @@ dependencies = [
  "zune-inflate",
 ]
 
+[[package]]
+name = "extended"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "af9673d8203fcb076b19dfd17e38b3d4ae9f44959416ea532ce72415a6020365"
+
 [[package]]
 name = "fast-float2"
 version = "0.2.3"
@@ -2812,7 +2909,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "04c269a76bfc6cea69553b7d040acb16c793119cebd97c756d21e08d0f075ff8"
 dependencies = [
  "anyhow",
- "hf-hub",
+ "hf-hub 0.4.3",
  "image",
  "ndarray",
  "ort",
@@ -3470,7 +3567,7 @@ checksum = "629d8f3bbeda9d148036d6b0de0a3ab947abd08ce90626327fc3547a49d59d97"
 dependencies = [
  "dirs",
  "http",
- "indicatif",
+ "indicatif 0.17.11",
  "libc",
  "log",
  "native-tls",
@@ -3479,10 +3576,34 @@ dependencies = [
  "serde",
  "serde_json",
  "thiserror 2.0.18",
- "ureq",
+ "ureq 2.12.1",
  "windows-sys 0.60.2",
 ]
 
+[[package]]
+name = "hf-hub"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "aef3982638978efa195ff11b305f51f1f22f4f0a6cabee7af79b383ebee6a213"
+dependencies = [
+ "dirs",
+ "futures",
+ "http",
+ "indicatif 0.18.4",
+ "libc",
+ "log",
+ "native-tls",
+ "num_cpus",
+ "rand 0.9.2",
+ "reqwest",
+ "serde",
+ "serde_json",
+ "thiserror 2.0.18",
+ "tokio",
+ "ureq 3.2.0",
+ "windows-sys 0.61.2",
+]
+
 [[package]]
 name = "hkdf"
 version = "0.12.4"
@@ -3986,13 +4107,26 @@ version = "0.17.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "183b3088984b400f4cfac3620d5e076c84da5364016b4f49473de574b2586235"
 dependencies = [
- "console",
+ "console 0.15.11",
  "number_prefix",
  "portable-atomic",
  "unicode-width",
  "web-time",
 ]
 
+[[package]]
+name = "indicatif"
+version = "0.18.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "25470f23803092da7d239834776d653104d551bc4d7eacaf31e6837854b8e9eb"
+dependencies = [
+ "console 0.16.2",
+ "portable-atomic",
+ "unicode-width",
+ "unit-prefix",
+ "web-time",
+]
+
 [[package]]
 name = "indoc"
 version = "2.0.7"
@@ -4915,6 +5049,16 @@ dependencies = [
  "cc",
 ]
 
+[[package]]
+name = "libloading"
+version = "0.8.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d7c4b02199fee7c5d21a5ae7d8cfa79a6ef5bb2fc834d6e9058e89c825efdc55"
+dependencies = [
+ "cfg-if",
+ "windows-link 0.2.1",
+]
+
 [[package]]
 name = "libm"
 version = "0.2.16"
@@ -4961,6 +5105,12 @@ version = "0.8.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6373607a59f0be73a39b6fe456b8192fcc3585f602af20751600e974dd455e77"
 
+[[package]]
+name = "litrs"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "11d3d7f243d5c5a8b9bb5d6dd2b1602c0cb0b9db1621bafc7ed66e35ff9fe092"
+
 [[package]]
 name = "lock_api"
 version = "0.4.14"
@@ -5886,7 +6036,7 @@ dependencies = [
  "pkg-config",
  "sha2",
  "tar",
- "ureq",
+ "ureq 2.12.1",
 ]
 
 [[package]]
@@ -7873,6 +8023,7 @@ dependencies = [
  "dirs",
  "fastembed",
  "futures",
+ "hf-hub 0.5.0",
  "ignore",
  "indoc",
  "lance-index",
@@ -7904,6 +8055,7 @@ dependencies = [
  "sha2",
  "slack-morphism",
  "sqlx",
+ "symphonia",
  "teloxide",
  "tempfile",
  "thiserror 2.0.18",
@@ -7920,6 +8072,7 @@ dependencies = [
  "twitch-irc",
  "urlencoding",
  "uuid",
+ "whisper-rs",
  "zip",
 ]
 
@@ -8271,6 +8424,178 @@ version = "2.6.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292"
 
+[[package]]
+name = "symphonia"
+version = "0.5.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5773a4c030a19d9bfaa090f49746ff35c75dfddfa700df7a5939d5e076a57039"
+dependencies = [
+ "lazy_static",
+ "symphonia-bundle-flac",
+ "symphonia-bundle-mp3",
+ "symphonia-codec-aac",
+ "symphonia-codec-adpcm",
+ "symphonia-codec-pcm",
+ "symphonia-codec-vorbis",
+ "symphonia-core",
+ "symphonia-format-isomp4",
+ "symphonia-format-mkv",
+ "symphonia-format-ogg",
+ "symphonia-format-riff",
+ "symphonia-metadata",
+]
+
+[[package]]
+name = "symphonia-bundle-flac"
+version = "0.5.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c91565e180aea25d9b80a910c546802526ffd0072d0b8974e3ebe59b686c9976"
+dependencies = [
+ "log",
+ "symphonia-core",
+ "symphonia-metadata",
+ "symphonia-utils-xiph",
+]
+
+[[package]]
+name = "symphonia-bundle-mp3"
+version = "0.5.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4872dd6bb56bf5eac799e3e957aa1981086c3e613b27e0ac23b176054f7c57ed"
+dependencies = [
+ "lazy_static",
+ "log",
+ "symphonia-core",
+ "symphonia-metadata",
+]
+
+[[package]]
+name = "symphonia-codec-aac"
+version = "0.5.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4c263845aa86881416849c1729a54c7f55164f8b96111dba59de46849e73a790"
+dependencies = [
+ "lazy_static",
+ "log",
+ "symphonia-core",
+]
+
+[[package]]
+name = "symphonia-codec-adpcm"
+version = "0.5.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2dddc50e2bbea4cfe027441eece77c46b9f319748605ab8f3443350129ddd07f"
+dependencies = [
+ "log",
+ "symphonia-core",
+]
+
+[[package]]
+name = "symphonia-codec-pcm"
+version = "0.5.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4e89d716c01541ad3ebe7c91ce4c8d38a7cf266a3f7b2f090b108fb0cb031d95"
+dependencies = [
+ "log",
+ "symphonia-core",
+]
+
+[[package]]
+name = "symphonia-codec-vorbis"
+version = "0.5.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f025837c309cd69ffef572750b4a2257b59552c5399a5e49707cc5b1b85d1c73"
+dependencies = [
+ "log",
+ "symphonia-core",
+ "symphonia-utils-xiph",
+]
+
+[[package]]
+name = "symphonia-core"
+version = "0.5.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ea00cc4f79b7f6bb7ff87eddc065a1066f3a43fe1875979056672c9ef948c2af"
+dependencies = [
+ "arrayvec",
+ "bitflags 1.3.2",
+ "bytemuck",
+ "lazy_static",
+ "log",
+]
+
+[[package]]
+name = "symphonia-format-isomp4"
+version = "0.5.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "243739585d11f81daf8dac8d9f3d18cc7898f6c09a259675fc364b382c30e0a5"
+dependencies = [
+ "encoding_rs",
+ "log",
+ "symphonia-core",
+ "symphonia-metadata",
+ "symphonia-utils-xiph",
+]
+
+[[package]]
+name = "symphonia-format-mkv"
+version = "0.5.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "122d786d2c43a49beb6f397551b4a050d8229eaa54c7ddf9ee4b98899b8742d0"
+dependencies = [
+ "lazy_static",
+ "log",
+ "symphonia-core",
+ "symphonia-metadata",
+ "symphonia-utils-xiph",
+]
+
+[[package]]
+name = "symphonia-format-ogg"
+version = "0.5.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2b4955c67c1ed3aa8ae8428d04ca8397fbef6a19b2b051e73b5da8b1435639cb"
+dependencies = [
+ "log",
+ "symphonia-core",
+ "symphonia-metadata",
+ "symphonia-utils-xiph",
+]
+
+[[package]]
+name = "symphonia-format-riff"
+version = "0.5.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c2d7c3df0e7d94efb68401d81906eae73c02b40d5ec1a141962c592d0f11a96f"
+dependencies = [
+ "extended",
+ "log",
+ "symphonia-core",
+ "symphonia-metadata",
+]
+
+[[package]]
+name = "symphonia-metadata"
+version = "0.5.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "36306ff42b9ffe6e5afc99d49e121e0bd62fe79b9db7b9681d48e29fa19e6b16"
+dependencies = [
+ "encoding_rs",
+ "lazy_static",
+ "log",
+ "symphonia-core",
+]
+
+[[package]]
+name = "symphonia-utils-xiph"
+version = "0.5.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ee27c85ab799a338446b68eec77abf42e1a6f1bb490656e121c6e27bfbab9f16"
+dependencies = [
+ "symphonia-core",
+ "symphonia-metadata",
+]
+
 [[package]]
 name = "syn"
 version = "1.0.109"
@@ -9386,6 +9711,12 @@ version = "0.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e"
 
+[[package]]
+name = "unit-prefix"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "81e544489bf3d8ef66c953931f56617f423cd4b5494be343d9b9d3dda037b9a3"
+
 [[package]]
 name = "universal-hash"
 version = "0.5.1"
@@ -9422,6 +9753,42 @@ dependencies = [
  "webpki-roots 0.26.11",
 ]
 
+[[package]]
+name = "ureq"
+version = "3.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fdc97a28575b85cfedf2a7e7d3cc64b3e11bd8ac766666318003abbacc7a21fc"
+dependencies = [
+ "base64 0.22.1",
+ "cookie_store",
+ "der",
+ "flate2",
+ "log",
+ "native-tls",
+ "percent-encoding",
+ "rustls 0.23.36",
+ "rustls-pki-types",
+ "serde",
+ "serde_json",
+ "socks",
+ "ureq-proto",
+ "utf-8",
+ "webpki-root-certs",
+ "webpki-roots 1.0.6",
+]
+
+[[package]]
+name = "ureq-proto"
+version = "0.5.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d81f9efa9df032be5934a46a068815a10a042b494b6a58cb0a1a97bb5467ed6f"
+dependencies = [
+ "base64 0.22.1",
+ "http",
+ "httparse",
+ "log",
+]
+
 [[package]]
 name = "url"
 version = "2.5.8"
@@ -9681,6 +10048,15 @@ dependencies = [
  "wasm-bindgen",
 ]
 
+[[package]]
+name = "webpki-root-certs"
+version = "1.0.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "804f18a4ac2676ffb4e8b5b5fa9ae38af06df08162314f96a68d2a363e21a8ca"
+dependencies = [
+ "rustls-pki-types",
+]
+
 [[package]]
 name = "webpki-roots"
 version = "0.25.4"
@@ -9722,6 +10098,27 @@ dependencies = [
  "winsafe",
 ]
 
+[[package]]
+name = "whisper-rs"
+version = "0.15.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "71ea5d2401f30f51d08126a2d133fee4c1955136519d7ac6cf6f5ac0a91e6bc8"
+dependencies = [
+ "whisper-rs-sys",
+]
+
+[[package]]
+name = "whisper-rs-sys"
+version = "0.14.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b5e2a6e06e7ac7b8f53c53a5f50bb0bc823ba69b63ecd887339f807a5598bbd2"
+dependencies = [
+ "bindgen",
+ "cfg-if",
+ "cmake",
+ "fs_extra",
+]
+
 [[package]]
 name = "whoami"
 version = "1.6.1"
diff --git a/Cargo.toml b/Cargo.toml
index fbd05795a..f39ec2b5d 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -134,11 +134,15 @@ tempfile = "3"
 
 # Prometheus metrics (optional, behind "metrics" feature)
 prometheus = { version = "0.13", optional = true }
+whisper-rs = { version = "0.15", optional = true }
+hf-hub = { version = "0.5", optional = true }
+symphonia = { version = "0.5", features = ["mp3", "aac", "flac", "ogg", "wav", "isomp4"], optional = true }
 pdf-extract = "0.10.0"
 open = "5.3.3"
 urlencoding = "2.1.3"
 
 [features]
+stt-whisper = ["dep:whisper-rs", "dep:hf-hub", "dep:symphonia"]
 metrics = ["dep:prometheus"]
 
 [lints.clippy]
diff --git a/src/agent/channel.rs b/src/agent/channel.rs
index f6e419e3b..5bb76de30 100644
--- a/src/agent/channel.rs
+++ b/src/agent/channel.rs
@@ -1841,6 +1841,32 @@ async fn transcribe_audio_attachment(
         ));
     }
 
+    // Local Whisper backend — bypass the LLM provider path entirely.
+    #[cfg(feature = "stt-whisper")]
+    if let Some(model_spec) = voice_model.strip_prefix("whisper-local://") {
+        let transcript = match crate::stt::transcribe(model_spec, &bytes).await {
+            Ok(text) if text.is_empty() => {
+                tracing::warn!(filename = %attachment.filename, "local Whisper returned empty transcript");
+                return UserContent::text(format!(
+                    "[Audio transcription returned empty text for {}]",
+                    attachment.filename
+                ));
+            }
+            Ok(text) => text,
+            Err(error) => {
+                tracing::warn!(%error, filename = %attachment.filename, "local Whisper transcription failed");
+                return UserContent::text(format!(
+                    "[Audio transcription failed for {}: {}]",
+                    attachment.filename, error
+                ));
+            }
+        };
+        return UserContent::text(format!(
+            "<voice_transcript name=\"{}\" mime=\"{}\">\n{}\n</voice_transcript>",
+            attachment.filename, attachment.mime_type, transcript
+        ));
+    }
+
     let (provider_id, model_name) = match deps.llm_manager.resolve_model(voice_model) {
         Ok(parts) => parts,
         Err(error) => {
diff --git a/src/lib.rs b/src/lib.rs
index ed80aed32..be4eb274d 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -22,6 +22,8 @@ pub mod skills;
 #[cfg(feature = "metrics")]
 pub mod telemetry;
 pub mod tools;
+#[cfg(feature = "stt-whisper")]
+pub mod stt;
 pub mod update;
 
 pub use error::{Error, Result};
diff --git a/src/stt.rs b/src/stt.rs
new file mode 100644
index 000000000..2e0e5a8cc
--- /dev/null
+++ b/src/stt.rs
@@ -0,0 +1,275 @@
+//! Local Whisper speech-to-text via whisper-rs.
+//!
+//! Only compiled when the `stt-whisper` feature is enabled.
+//! Exposed as a single async `transcribe` function that lazily loads and caches
+//! the model context for the lifetime of the process.
+
+#[cfg(feature = "stt-whisper")]
+pub use local::transcribe;
+
+#[cfg(feature = "stt-whisper")]
+mod local {
+    use std::sync::OnceLock;
+
+    use hf_hub::api::sync::Api;
+    use whisper_rs::{FullParams, SamplingStrategy, WhisperContext, WhisperContextParameters};
+
+    /// Known model size names and their GGML filenames on `ggerganov/whisper.cpp`.
+    const KNOWN_SIZES: &[(&str, &str)] = &[
+        ("tiny", "ggml-tiny.bin"),
+        ("tiny.en", "ggml-tiny.en.bin"),
+        ("base", "ggml-base.bin"),
+        ("base.en", "ggml-base.en.bin"),
+        ("small", "ggml-small.bin"),
+        ("small.en", "ggml-small.en.bin"),
+        ("medium", "ggml-medium.bin"),
+        ("medium.en", "ggml-medium.en.bin"),
+        ("large", "ggml-large-v3.bin"),
+        ("large-v1", "ggml-large-v1.bin"),
+        ("large-v2", "ggml-large-v2.bin"),
+        ("large-v3", "ggml-large-v3.bin"),
+    ];
+
+    /// Cached (model_spec, WhisperContext) — one per process.
+    ///
+    /// If the user changes `routing.voice` at runtime we just keep using the
+    /// already-loaded model; a restart is required to switch models.
+    static CONTEXT: OnceLock<(String, WhisperContext)> = OnceLock::new();
+
+    #[derive(Debug, thiserror::Error)]
+    pub enum WhisperError {
+        #[error("model not found and could not be downloaded: {0}")]
+        ModelNotFound(String),
+        #[error("hf-hub error: {0}")]
+        HfHub(String),
+        #[error("failed to load whisper model: {0}")]
+        Load(String),
+        #[error("failed to create whisper state: {0}")]
+        State(String),
+        #[error("transcription failed: {0}")]
+        Transcription(String),
+        #[error("audio decode error: {0}")]
+        Decode(String),
+    }
+
+    /// Transcribe raw audio bytes using the local Whisper model.
+    ///
+    /// `model_spec` is the part after `whisper-local://`:
+    /// - A known size name (`small`, `medium`, `large`, …) — downloaded from HF
+    ///   into the HF cache on first use.
+    /// - An absolute path (`/path/to/ggml-small.bin`) — loaded directly.
+    pub async fn transcribe(model_spec: &str, audio: &[u8]) -> Result<String, WhisperError> {
+        let model_spec = model_spec.to_owned();
+        let audio = audio.to_vec();
+
+        // Whisper inference is CPU-bound and blocking — run on a thread pool.
+        tokio::task::spawn_blocking(move || transcribe_blocking(&model_spec, &audio))
+            .await
+            .map_err(|e| WhisperError::Transcription(e.to_string()))?
+    }
+
+    fn transcribe_blocking(model_spec: &str, audio: &[u8]) -> Result<String, WhisperError> {
+        let ctx = get_or_load_context(model_spec)?;
+
+        let mut state = ctx
+            .create_state()
+            .map_err(|e| WhisperError::State(e.to_string()))?;
+
+        let samples = decode_to_f32(audio)?;
+
+        let mut params = FullParams::new(SamplingStrategy::Greedy { best_of: 1 });
+        params.set_language(Some("auto"));
+        params.set_print_progress(false);
+        params.set_print_realtime(false);
+        params.set_print_timestamps(false);
+
+        state
+            .full(params, &samples)
+            .map_err(|e| WhisperError::Transcription(e.to_string()))?;
+
+        let n = state.full_n_segments();
+        let mut parts = Vec::with_capacity(n as usize);
+        for i in 0..n {
+            if let Some(segment) = state.get_segment(i) {
+                if let Ok(text) = segment.to_str() {
+                    let trimmed = text.trim();
+                    if !trimmed.is_empty() {
+                        parts.push(trimmed.to_owned());
+                    }
+                }
+            }
+        }
+
+        Ok(parts.join(" "))
+    }
+
+    /// Return the cached context, loading it first if necessary.
+    fn get_or_load_context(model_spec: &str) -> Result<&'static WhisperContext, WhisperError> {
+        if let Some((_, ctx)) = CONTEXT.get() {
+            return Ok(ctx);
+        }
+
+        let model_path = resolve_model_path(model_spec)?;
+
+        tracing::info!(model_path = %model_path, "loading local Whisper model");
+
+        let params = WhisperContextParameters::default();
+        let ctx = WhisperContext::new_with_params(&model_path, params)
+            .map_err(|e| WhisperError::Load(e.to_string()))?;
+
+        let _ = CONTEXT.set((model_spec.to_owned(), ctx));
+
+        tracing::info!(model_path = %model_path, "Whisper model loaded and cached");
+
+        Ok(&CONTEXT.get().unwrap().1)
+    }
+
+    /// Resolve a model spec to an absolute path on disk, downloading via hf-hub if needed.
+    fn resolve_model_path(spec: &str) -> Result<String, WhisperError> {
+        // Absolute path — use directly.
+        if spec.starts_with('/') {
+            if std::path::Path::new(spec).exists() {
+                return Ok(spec.to_owned());
+            }
+            return Err(WhisperError::ModelNotFound(format!(
+                "model file not found: {spec}"
+            )));
+        }
+
+        // Known size name — fetch via hf-hub (uses HF_HOME cache, downloads if missing).
+        let filename = KNOWN_SIZES
+            .iter()
+            .find(|(name, _)| *name == spec)
+            .map(|(_, file)| *file)
+            .ok_or_else(|| {
+                WhisperError::ModelNotFound(format!(
+                    "unknown model size '{spec}'; use one of: {}",
+                    KNOWN_SIZES
+                        .iter()
+                        .map(|(n, _)| *n)
+                        .collect::<Vec<_>>()
+                        .join(", ")
+                ))
+            })?;
+
+        tracing::info!(model = %spec, filename = %filename, "fetching Whisper model via hf-hub");
+
+        let api = Api::new().map_err(|e| WhisperError::HfHub(e.to_string()))?;
+        let repo = api.model("ggerganov/whisper.cpp".to_owned());
+        let path = repo
+            .get(filename)
+            .map_err(|e| WhisperError::HfHub(e.to_string()))?;
+
+        Ok(path.to_string_lossy().to_string())
+    }
+
+    /// Decode arbitrary audio bytes to 16 kHz mono f32 samples for Whisper.
+    ///
+    /// Uses symphonia so ogg/opus, mp3, flac, wav, etc. all work without manual
+    /// format detection.
+    fn decode_to_f32(audio: &[u8]) -> Result<Vec<f32>, WhisperError> {
+        use symphonia::core::codecs::DecoderOptions;
+        use symphonia::core::formats::FormatOptions;
+        use symphonia::core::io::MediaSourceStream;
+        use symphonia::core::meta::MetadataOptions;
+        use symphonia::core::probe::Hint;
+
+        let cursor = std::io::Cursor::new(audio.to_vec());
+        let mss = MediaSourceStream::new(Box::new(cursor), Default::default());
+
+        let probed = symphonia::default::get_probe()
+            .format(
+                &Hint::new(),
+                mss,
+                &FormatOptions::default(),
+                &MetadataOptions::default(),
+            )
+            .map_err(|e| WhisperError::Decode(e.to_string()))?;
+
+        let mut format = probed.format;
+        let track = format
+            .tracks()
+            .iter()
+            .find(|t| {
+                t.codec_params.codec != symphonia::core::codecs::CODEC_TYPE_NULL
+            })
+            .ok_or_else(|| WhisperError::Decode("no audio track found".into()))?
+            .clone();
+
+        let mut decoder = symphonia::default::get_codecs()
+            .make(&track.codec_params, &DecoderOptions::default())
+            .map_err(|e| WhisperError::Decode(e.to_string()))?;
+
+        let track_id = track.id;
+        let sample_rate = track.codec_params.sample_rate.unwrap_or(16000);
+        let channels = track
+            .codec_params
+            .channels
+            .map(|c| c.count())
+            .unwrap_or(1);
+
+        let mut raw_samples: Vec<f32> = Vec::new();
+
+        loop {
+            let packet = match format.next_packet() {
+                Ok(p) => p,
+                Err(symphonia::core::errors::Error::IoError(_)) => break,
+                Err(symphonia::core::errors::Error::ResetRequired) => break,
+                Err(e) => return Err(WhisperError::Decode(e.to_string())),
+            };
+
+            if packet.track_id() != track_id {
+                continue;
+            }
+
+            let decoded = decoder
+                .decode(&packet)
+                .map_err(|e| WhisperError::Decode(e.to_string()))?;
+
+            // Convert to f32 mono using a sample-converting audio buffer.
+            use symphonia::core::audio::{AudioBuffer, Signal as _};
+
+            let mut f32_buf: AudioBuffer<f32> = AudioBuffer::new(
+                decoded.capacity() as u64,
+                decoded.spec().clone(),
+            );
+            decoded.convert(&mut f32_buf);
+
+            // Mix down to mono.
+            let frames = f32_buf.frames();
+            for frame in 0..frames {
+                let mut sum = 0f32;
+                for ch in 0..channels {
+                    sum += f32_buf.chan(ch)[frame];
+                }
+                raw_samples.push(sum / channels as f32);
+            }
+        }
+
+        // Resample to 16 kHz if needed.
+        if sample_rate != 16000 {
+            raw_samples = resample(raw_samples, sample_rate, 16000);
+        }
+
+        Ok(raw_samples)
+    }
+
+    /// Simple linear resampler (good enough for speech; not for music).
+    fn resample(samples: Vec<f32>, from_hz: u32, to_hz: u32) -> Vec<f32> {
+        if from_hz == to_hz {
+            return samples;
+        }
+        let ratio = from_hz as f64 / to_hz as f64;
+        let out_len = (samples.len() as f64 / ratio) as usize;
+        let mut out = Vec::with_capacity(out_len);
+        for i in 0..out_len {
+            let pos = i as f64 * ratio;
+            let idx = pos as usize;
+            let frac = (pos - idx as f64) as f32;
+            let a = samples.get(idx).copied().unwrap_or(0.0);
+            let b = samples.get(idx + 1).copied().unwrap_or(0.0);
+            out.push(a + frac * (b - a));
+        }
+        out
+    }
+}

From 3f1f3e3ee548867d13675195e624061177a86c4f Mon Sep 17 00:00:00 2001
From: Marenz <github@supradigital.org>
Date: Sat, 21 Feb 2026 13:20:17 +0100
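Subject: [PATCH 2/3] Enable Vulkan GPU backend and Ogg/Opus decode for local
 Whisper STT

Build whisper-rs with its `vulkan` feature whenever stt-whisper is
enabled.

Detect Ogg/Opus input (e.g. Telegram voice messages) by its OggS /
OpusHead magic and decode it with the `ogg` and `opus` crates; every
other format still goes through symphonia. Both new crates are optional
and pulled in by the stt-whisper feature.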
---
 Cargo.lock | 32 +++++++++++++++++++++++
 Cargo.toml |  6 +++--
 src/stt.rs | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 111 insertions(+), 4 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index a41956207..1846f49df 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -621,6 +621,17 @@ version = "1.1.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0"
 
+[[package]]
+name = "audiopus_sys"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "62314a1546a2064e033665d658e88c620a62904be945f8147e6b16c3db9f8651"
+dependencies = [
+ "cmake",
+ "log",
+ "pkg-config",
+]
+
 [[package]]
 name = "autocfg"
 version = "1.5.0"
@@ -5800,6 +5811,15 @@ dependencies = [
  "web-time",
 ]
 
+[[package]]
+name = "ogg"
+version = "0.9.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fdab8dcd8d4052eaacaf8fb07a3ccd9a6e26efadb42878a413c68fc4af1dee2b"
+dependencies = [
+ "byteorder",
+]
+
 [[package]]
 name = "once_cell"
 version = "1.21.3"
@@ -5996,6 +6016,15 @@ version = "0.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d"
 
+[[package]]
+name = "opus"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4d3809943dff6fbad5f0484449ea26bdb9cb7d8efdf26ed50d3c7f227f69eb5c"
+dependencies = [
+ "audiopus_sys",
+]
+
 [[package]]
 name = "ordered-float"
 version = "5.1.0"
@@ -8032,11 +8061,13 @@ dependencies = [
  "mime_guess",
  "minijinja",
  "notify",
+ "ogg",
  "open",
  "opentelemetry",
  "opentelemetry-otlp",
  "opentelemetry-semantic-conventions",
  "opentelemetry_sdk",
+ "opus",
  "pdf-extract",
  "pin-project",
  "prometheus",
@@ -10104,6 +10135,7 @@ version = "0.15.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "71ea5d2401f30f51d08126a2d133fee4c1955136519d7ac6cf6f5ac0a91e6bc8"
 dependencies = [
+ "libc",
  "whisper-rs-sys",
 ]
 
diff --git a/Cargo.toml b/Cargo.toml
index f39ec2b5d..5c9f581fb 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -134,15 +134,17 @@ tempfile = "3"
 
 # Prometheus metrics (optional, behind "metrics" feature)
 prometheus = { version = "0.13", optional = true }
-whisper-rs = { version = "0.15", optional = true }
+whisper-rs = { version = "0.15", optional = true, features = ["vulkan"] }
 hf-hub = { version = "0.5", optional = true }
 symphonia = { version = "0.5", features = ["mp3", "aac", "flac", "ogg", "wav", "isomp4"], optional = true }
+ogg = { version = "0.9", optional = true }
+opus = { version = "0.3", optional = true }
 pdf-extract = "0.10.0"
 open = "5.3.3"
 urlencoding = "2.1.3"
 
 [features]
-stt-whisper = ["dep:whisper-rs", "dep:hf-hub", "dep:symphonia"]
+stt-whisper = ["dep:whisper-rs", "dep:hf-hub", "dep:symphonia", "dep:ogg", "dep:opus"]
 metrics = ["dep:prometheus"]
 
 [lints.clippy]
diff --git a/src/stt.rs b/src/stt.rs
index 2e0e5a8cc..d07afa400 100644
--- a/src/stt.rs
+++ b/src/stt.rs
@@ -165,9 +165,13 @@ mod local {
 
     /// Decode arbitrary audio bytes to 16 kHz mono f32 samples for Whisper.
     ///
-    /// Uses symphonia so ogg/opus, mp3, flac, wav, etc. all work without manual
-    /// format detection.
+    /// Ogg/Opus (Telegram voice messages) is handled directly via the `ogg` +
+    /// `opus` crates. Everything else falls through to symphonia.
     fn decode_to_f32(audio: &[u8]) -> Result<Vec<f32>, WhisperError> {
+        if is_ogg_opus(audio) {
+            return decode_ogg_opus(audio);
+        }
+
         use symphonia::core::codecs::DecoderOptions;
         use symphonia::core::formats::FormatOptions;
         use symphonia::core::io::MediaSourceStream;
@@ -254,6 +258,75 @@ mod local {
         Ok(raw_samples)
     }
 
+    /// Report whether `audio` looks like an Ogg container carrying an Opus stream.
+    fn is_ogg_opus(audio: &[u8]) -> bool {
+        // Needs the `OggS` capture pattern up front plus the `OpusHead` magic at
+        // byte 28, where the first packet of the first logical stream begins.
+        audio.len() > 36 && audio[..4] == *b"OggS" && audio[28..36] == *b"OpusHead"
+    }
+
+    /// Decode Ogg/Opus audio to 16 kHz mono f32 samples.
+    ///
+    /// Packet 0 (OpusHead) supplies the channel layout and pre-skip, packet 1
+    /// (OpusTags) is ignored, and every later packet is decoded at 48 kHz and
+    /// averaged down to mono. The pre-skip is then dropped and the rest is
+    /// resampled to 16 kHz. A container read error ends decoding early and
+    /// keeps whatever was decoded up to that point.
+    ///
+    /// # Errors
+    /// Returns `WhisperError::Decode` if the Opus decoder cannot be created or
+    /// a packet fails to decode.
+    fn decode_ogg_opus(audio: &[u8]) -> Result<Vec<f32>, WhisperError> {
+        use ogg::reading::PacketReader;
+
+        // libopus always decodes to 48 kHz, whatever the original input rate was.
+        const OPUS_RATE: u32 = 48_000;
+        // Max Opus frame: 120ms at 48kHz = 5760 samples per channel.
+        const MAX_FRAME: usize = 5760;
+
+        let mut reader = PacketReader::new(std::io::Cursor::new(audio));
+        // Yield packets until the stream ends or the first read error.
+        let mut packets = std::iter::from_fn(|| reader.read_packet().ok().flatten()).fuse();
+
+        // OpusHead: byte 9 is the channel count, bytes 10..12 the little-endian
+        // pre-skip. A missing or malformed header falls back to mono, no pre-skip.
+        let head = packets.next().map(|packet| packet.data).unwrap_or_default();
+        let (stereo, pre_skip) = if head.len() >= 12 && head.starts_with(b"OpusHead") {
+            (head[9] == 2, usize::from(u16::from_le_bytes([head[10], head[11]])))
+        } else {
+            (false, 0)
+        };
+        // Keep the mixdown width tied to the decoder layout: anything other than
+        // stereo is decoded as mono, so it must also be mixed as one channel.
+        let (layout, channels) = if stereo {
+            (opus::Channels::Stereo, 2usize)
+        } else {
+            (opus::Channels::Mono, 1usize)
+        };
+        let mut decoder = opus::Decoder::new(OPUS_RATE, layout)
+            .map_err(|e| WhisperError::Decode(e.to_string()))?;
+
+        // One scratch buffer sized for the largest frame, reused for every packet.
+        let mut pcm = vec![0f32; MAX_FRAME * channels];
+        let mut samples: Vec<f32> = Vec::new();
+        // `skip(1)` drops OpusTags; everything after it is audio.
+        for packet in packets.skip(1) {
+            let n = decoder
+                .decode_float(&packet.data, &mut pcm, false)
+                .map_err(|e| WhisperError::Decode(e.to_string()))?;
+            // `n` counts samples per channel; average each interleaved frame.
+            samples.extend(
+                pcm[..n * channels]
+                    .chunks_exact(channels)
+                    .map(|frame| frame.iter().sum::<f32>() / channels as f32),
+            );
+        }
+
+        // RFC 7845: the first `pre_skip` samples (at 48 kHz) are decoder priming.
+        samples.drain(..pre_skip.min(samples.len()));
+        Ok(resample(samples, OPUS_RATE, 16000))
+    }
+
     /// Simple linear resampler (good enough for speech; not for music).
     fn resample(samples: Vec<f32>, from_hz: u32, to_hz: u32) -> Vec<f32> {
         if from_hz == to_hz {

From aeea2a14b4139cc1d830010dbbd5932da9e53e66 Mon Sep 17 00:00:00 2001
From: Marenz <github@supradigital.org>
Date: Sat, 21 Feb 2026 13:36:15 +0100
Subject: [PATCH 3/3] docs: document local Whisper STT backend in README

---
 README.md | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/README.md b/README.md
index 82c98caf9..5bd52cf01 100644
--- a/README.md
+++ b/README.md
@@ -193,6 +193,30 @@ channel = "my-provider/my-model"
 
 Additional built-in providers include **NVIDIA**, **MiniMax**, **Moonshot AI (Kimi)**, and **Z.AI Coding Plan** — configure with `nvidia_key`, `minimax_key`, `moonshot_key`, or `zai_coding_plan_key` in `[llm]`.
 
+### Voice Transcription
+
+Audio attachments (voice messages, audio files) are transcribed before being passed to the channel. Set `routing.voice` to choose the backend:
+
+**Provider-based** — route through any configured LLM provider that supports audio input:
+
+```toml
+[defaults.routing]
+voice = "openai/whisper-1"
+```
+
+**Local Whisper** (requires building with `--features stt-whisper`) — runs inference locally via [whisper-rs](https://codeberg.org/tazz4843/whisper-rs); no API call is needed:
+
+```toml
+[defaults.routing]
+voice = "whisper-local://small"
+```
+
+The model is downloaded automatically from [`ggerganov/whisper.cpp`](https://huggingface.co/ggerganov/whisper.cpp) on first use and cached in `~/.cache/huggingface/hub`. Supported size names: `tiny`, `tiny.en`, `base`, `base.en`, `small`, `small.en`, `medium`, `medium.en`, `large`, `large-v1`, `large-v2`, `large-v3`. An absolute path to a GGML model file also works.
+
+GPU acceleration via Vulkan is enabled automatically when a compatible device is detected. The loaded model is cached for the process lifetime — restart to switch models.
+
+Ogg/Opus audio (e.g. Telegram voice messages) is decoded directly with the `ogg` and `opus` crates. All other formats are handled via symphonia.
+
 ### Skills
 
 Extensible skill system integrated with [skills.sh](https://skills.sh):