From f7140455cbcd88698e96f770591b2a0674cfd87a Mon Sep 17 00:00:00 2001 From: Marenz Date: Sat, 21 Feb 2026 11:54:15 +0100 Subject: [PATCH 1/3] Add local Whisper STT backend via whisper-rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When routing.voice = "whisper-local://", audio attachments are transcribed locally instead of via the LLM provider HTTP path. is either: - A known size name (tiny/base/small/medium/large) — fetched from ggerganov/whisper.cpp on HuggingFace via hf-hub, using the existing HF cache if already present - An absolute path to a GGML model file The WhisperContext is loaded once and cached in a OnceLock for the process lifetime. Audio decoding (ogg, opus, mp3, flac, wav, m4a) is handled by symphonia with linear resampling to 16 kHz mono f32. All three deps (whisper-rs, hf-hub, symphonia) are optional behind the stt-whisper feature flag. --- Cargo.lock | 409 ++++++++++++++++++++++++++++++++++++++++++- Cargo.toml | 4 + src/agent/channel.rs | 26 +++ src/lib.rs | 2 + src/stt.rs | 275 +++++++++++++++++++++++++++++ 5 files changed, 710 insertions(+), 6 deletions(-) create mode 100644 src/stt.rs diff --git a/Cargo.lock b/Cargo.lock index 027a6432e..a41956207 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -776,6 +776,26 @@ dependencies = [ "num-traits", ] +[[package]] +name = "bindgen" +version = "0.71.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f58bf3d7db68cfbac37cfc485a8d711e87e064c3d0fe0435b92f7a407f9d6b3" +dependencies = [ + "bitflags 2.10.0", + "cexpr", + "clang-sys", + "itertools 0.13.0", + "log", + "prettyplease", + "proc-macro2", + "quote", + "regex", + "rustc-hash", + "shlex", + "syn 2.0.114", +] + [[package]] name = "bit_field" version = "0.10.3" @@ -1105,6 +1125,15 @@ version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4f4c707c6a209cbe82d10abd08e1ea8995e9ea937d2550646e02798948992be0" +[[package]] +name = "cexpr" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766" +dependencies = [ + "nom 7.1.3", +] + [[package]] name = "cff-parser" version = "0.1.0" @@ -1223,6 +1252,17 @@ dependencies = [ "inout", ] +[[package]] +name = "clang-sys" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4" +dependencies = [ + "glob", + "libc", + "libloading", +] + [[package]] name = "clap" version = "4.5.58" @@ -1377,6 +1417,19 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "console" +version = "0.16.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "03e45a4a8926227e4197636ba97a9fc9b00477e9f4bd711395687c5f0734bec4" +dependencies = [ + "encode_unicode", + "libc", + "once_cell", + "unicode-width", + "windows-sys 0.61.2", +] + [[package]] name = "const-oid" version = "0.9.6" @@ -1433,6 +1486,35 @@ dependencies = [ "unicode-segmentation", ] +[[package]] +name = "cookie" +version = "0.18.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ddef33a339a91ea89fb53151bd0a4689cfce27055c291dfa69945475d22c747" +dependencies = [ + "percent-encoding", + "time", + "version_check", +] + +[[package]] +name = "cookie_store" +version = "0.22.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "15b2c103cf610ec6cae3da84a766285b42fd16aad564758459e6ecf128c75206" +dependencies = [ + "cookie", + "document-features", + "idna", + "indexmap 2.13.0", + "log", + "serde", + "serde_derive", + "serde_json", + "time", + "url", +] + [[package]] name = "core-foundation" version = "0.9.4" @@ -2481,7 +2563,7 @@ version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "658bce805d770f407bc62102fca7c2c64ceef2fbcb2b8bd19d2765ce093980de" dependencies = [ - "console", + "console 0.15.11", "shell-words", "tempfile", "thiserror 1.0.69", @@ -2553,6 +2635,15 @@ dependencies = [ "const-random", ] +[[package]] +name = "document-features" +version = "0.2.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4b8a88685455ed29a21542a33abd9cb6510b6b129abadabdcef0f4c55bc8f61" +dependencies = [ + "litrs", +] + [[package]] name = "dotenvy" version = "0.15.7" @@ -2793,6 +2884,12 @@ dependencies = [ "zune-inflate", ] +[[package]] +name = "extended" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af9673d8203fcb076b19dfd17e38b3d4ae9f44959416ea532ce72415a6020365" + [[package]] name = "fast-float2" version = "0.2.3" @@ -2812,7 +2909,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "04c269a76bfc6cea69553b7d040acb16c793119cebd97c756d21e08d0f075ff8" dependencies = [ "anyhow", - "hf-hub", + "hf-hub 0.4.3", "image", "ndarray", "ort", @@ -3470,7 +3567,7 @@ checksum = "629d8f3bbeda9d148036d6b0de0a3ab947abd08ce90626327fc3547a49d59d97" dependencies = [ "dirs", "http", - "indicatif", + "indicatif 0.17.11", "libc", "log", "native-tls", @@ -3479,10 +3576,34 @@ dependencies = [ "serde", "serde_json", "thiserror 2.0.18", - "ureq", + "ureq 2.12.1", "windows-sys 0.60.2", ] +[[package]] +name = "hf-hub" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aef3982638978efa195ff11b305f51f1f22f4f0a6cabee7af79b383ebee6a213" +dependencies = [ + "dirs", + "futures", + "http", + "indicatif 0.18.4", + "libc", + "log", + "native-tls", + "num_cpus", + "rand 0.9.2", + "reqwest", + "serde", + "serde_json", + "thiserror 2.0.18", + "tokio", + "ureq 3.2.0", + "windows-sys 0.61.2", +] + [[package]] name = "hkdf" version = "0.12.4" @@ -3986,13 +4107,26 @@ version = "0.17.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "183b3088984b400f4cfac3620d5e076c84da5364016b4f49473de574b2586235" dependencies = [ - "console", + "console 0.15.11", "number_prefix", "portable-atomic", "unicode-width", "web-time", ] +[[package]] +name = "indicatif" +version = "0.18.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "25470f23803092da7d239834776d653104d551bc4d7eacaf31e6837854b8e9eb" +dependencies = [ + "console 0.16.2", + "portable-atomic", + "unicode-width", + "unit-prefix", + "web-time", +] + [[package]] name = "indoc" version = "2.0.7" @@ -4915,6 +5049,16 @@ dependencies = [ "cc", ] +[[package]] +name = "libloading" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7c4b02199fee7c5d21a5ae7d8cfa79a6ef5bb2fc834d6e9058e89c825efdc55" +dependencies = [ + "cfg-if", + "windows-link 0.2.1", +] + [[package]] name = "libm" version = "0.2.16" @@ -4961,6 +5105,12 @@ version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6373607a59f0be73a39b6fe456b8192fcc3585f602af20751600e974dd455e77" +[[package]] +name = "litrs" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "11d3d7f243d5c5a8b9bb5d6dd2b1602c0cb0b9db1621bafc7ed66e35ff9fe092" + [[package]] name = "lock_api" version = "0.4.14" @@ -5886,7 +6036,7 @@ dependencies = [ "pkg-config", "sha2", "tar", - "ureq", + "ureq 2.12.1", ] [[package]] @@ -7873,6 +8023,7 @@ dependencies = [ "dirs", "fastembed", "futures", + "hf-hub 0.5.0", "ignore", "indoc", "lance-index", @@ -7904,6 +8055,7 @@ dependencies = [ "sha2", "slack-morphism", "sqlx", + "symphonia", "teloxide", "tempfile", "thiserror 2.0.18", @@ -7920,6 +8072,7 @@ dependencies = [ "twitch-irc", "urlencoding", "uuid", + "whisper-rs", "zip", ] @@ -8271,6 +8424,178 @@ version = "2.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" +[[package]] +name = "symphonia" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5773a4c030a19d9bfaa090f49746ff35c75dfddfa700df7a5939d5e076a57039" +dependencies = [ + "lazy_static", + "symphonia-bundle-flac", + "symphonia-bundle-mp3", + "symphonia-codec-aac", + "symphonia-codec-adpcm", + "symphonia-codec-pcm", + "symphonia-codec-vorbis", + "symphonia-core", + "symphonia-format-isomp4", + "symphonia-format-mkv", + "symphonia-format-ogg", + "symphonia-format-riff", + "symphonia-metadata", +] + +[[package]] +name = "symphonia-bundle-flac" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c91565e180aea25d9b80a910c546802526ffd0072d0b8974e3ebe59b686c9976" +dependencies = [ + "log", + "symphonia-core", + "symphonia-metadata", + "symphonia-utils-xiph", +] + +[[package]] +name = "symphonia-bundle-mp3" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4872dd6bb56bf5eac799e3e957aa1981086c3e613b27e0ac23b176054f7c57ed" +dependencies = [ + "lazy_static", + "log", + "symphonia-core", + "symphonia-metadata", +] + +[[package]] +name = "symphonia-codec-aac" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c263845aa86881416849c1729a54c7f55164f8b96111dba59de46849e73a790" +dependencies = [ + "lazy_static", + "log", + "symphonia-core", +] + +[[package]] +name = "symphonia-codec-adpcm" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2dddc50e2bbea4cfe027441eece77c46b9f319748605ab8f3443350129ddd07f" +dependencies = [ + "log", + "symphonia-core", +] + +[[package]] +name = "symphonia-codec-pcm" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e89d716c01541ad3ebe7c91ce4c8d38a7cf266a3f7b2f090b108fb0cb031d95" +dependencies = [ + "log", + "symphonia-core", +] + +[[package]] +name = "symphonia-codec-vorbis" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f025837c309cd69ffef572750b4a2257b59552c5399a5e49707cc5b1b85d1c73" +dependencies = [ + "log", + "symphonia-core", + "symphonia-utils-xiph", +] + +[[package]] +name = "symphonia-core" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea00cc4f79b7f6bb7ff87eddc065a1066f3a43fe1875979056672c9ef948c2af" +dependencies = [ + "arrayvec", + "bitflags 1.3.2", + "bytemuck", + "lazy_static", + "log", +] + +[[package]] +name = "symphonia-format-isomp4" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "243739585d11f81daf8dac8d9f3d18cc7898f6c09a259675fc364b382c30e0a5" +dependencies = [ + "encoding_rs", + "log", + "symphonia-core", + "symphonia-metadata", + "symphonia-utils-xiph", +] + +[[package]] +name = "symphonia-format-mkv" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "122d786d2c43a49beb6f397551b4a050d8229eaa54c7ddf9ee4b98899b8742d0" +dependencies = [ + "lazy_static", + "log", + "symphonia-core", + "symphonia-metadata", + "symphonia-utils-xiph", +] + +[[package]] +name = "symphonia-format-ogg" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b4955c67c1ed3aa8ae8428d04ca8397fbef6a19b2b051e73b5da8b1435639cb" +dependencies = [ + "log", + "symphonia-core", + "symphonia-metadata", + "symphonia-utils-xiph", +] + +[[package]] +name = "symphonia-format-riff" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2d7c3df0e7d94efb68401d81906eae73c02b40d5ec1a141962c592d0f11a96f" +dependencies = [ + "extended", + "log", + "symphonia-core", + "symphonia-metadata", +] + +[[package]] +name = "symphonia-metadata" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "36306ff42b9ffe6e5afc99d49e121e0bd62fe79b9db7b9681d48e29fa19e6b16" +dependencies = [ + "encoding_rs", + "lazy_static", + "log", + "symphonia-core", +] + +[[package]] +name = "symphonia-utils-xiph" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee27c85ab799a338446b68eec77abf42e1a6f1bb490656e121c6e27bfbab9f16" +dependencies = [ + "symphonia-core", + "symphonia-metadata", +] + [[package]] name = "syn" version = "1.0.109" @@ -9386,6 +9711,12 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e" +[[package]] +name = "unit-prefix" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81e544489bf3d8ef66c953931f56617f423cd4b5494be343d9b9d3dda037b9a3" + [[package]] name = "universal-hash" version = "0.5.1" @@ -9422,6 +9753,42 @@ dependencies = [ "webpki-roots 0.26.11", ] +[[package]] +name = "ureq" +version = "3.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fdc97a28575b85cfedf2a7e7d3cc64b3e11bd8ac766666318003abbacc7a21fc" +dependencies = [ + "base64 0.22.1", + "cookie_store", + "der", + "flate2", + "log", + "native-tls", + "percent-encoding", + "rustls 0.23.36", + "rustls-pki-types", + "serde", + "serde_json", + "socks", + "ureq-proto", + "utf-8", + "webpki-root-certs", + "webpki-roots 1.0.6", +] + +[[package]] +name = "ureq-proto" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d81f9efa9df032be5934a46a068815a10a042b494b6a58cb0a1a97bb5467ed6f" +dependencies = [ + "base64 0.22.1", + "http", + "httparse", + "log", +] + [[package]] name = "url" version = "2.5.8" @@ -9681,6 +10048,15 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "webpki-root-certs" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "804f18a4ac2676ffb4e8b5b5fa9ae38af06df08162314f96a68d2a363e21a8ca" +dependencies = [ + "rustls-pki-types", +] + [[package]] name = "webpki-roots" version = "0.25.4" @@ -9722,6 +10098,27 @@ dependencies = [ "winsafe", ] +[[package]] +name = "whisper-rs" +version = "0.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "71ea5d2401f30f51d08126a2d133fee4c1955136519d7ac6cf6f5ac0a91e6bc8" +dependencies = [ + "whisper-rs-sys", +] + +[[package]] +name = "whisper-rs-sys" +version = "0.14.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5e2a6e06e7ac7b8f53c53a5f50bb0bc823ba69b63ecd887339f807a5598bbd2" +dependencies = [ + "bindgen", + "cfg-if", + "cmake", + "fs_extra", +] + [[package]] name = "whoami" version = "1.6.1" diff --git a/Cargo.toml b/Cargo.toml index fbd05795a..f39ec2b5d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -134,11 +134,15 @@ tempfile = "3" # Prometheus metrics (optional, behind "metrics" feature) prometheus = { version = "0.13", optional = true } +whisper-rs = { version = "0.15", optional = true } +hf-hub = { version = "0.5", optional = true } +symphonia = { version = "0.5", features = ["mp3", "aac", "flac", "ogg", "wav", "isomp4"], optional = true } pdf-extract = "0.10.0" open = "5.3.3" urlencoding = "2.1.3" [features] +stt-whisper = ["dep:whisper-rs", "dep:hf-hub", "dep:symphonia"] metrics = ["dep:prometheus"] [lints.clippy] diff --git a/src/agent/channel.rs b/src/agent/channel.rs index f6e419e3b..5bb76de30 100644 --- a/src/agent/channel.rs +++ b/src/agent/channel.rs @@ -1841,6 +1841,32 @@ async fn transcribe_audio_attachment( )); } + // Local Whisper backend — bypass the LLM provider path entirely. + #[cfg(feature = "stt-whisper")] + if let Some(model_spec) = voice_model.strip_prefix("whisper-local://") { + let transcript = match crate::stt::transcribe(model_spec, &bytes).await { + Ok(text) if text.is_empty() => { + tracing::warn!(filename = %attachment.filename, "local Whisper returned empty transcript"); + return UserContent::text(format!( + "[Audio transcription returned empty text for {}]", + attachment.filename + )); + } + Ok(text) => text, + Err(error) => { + tracing::warn!(%error, filename = %attachment.filename, "local Whisper transcription failed"); + return UserContent::text(format!( + "[Audio transcription failed for {}: {}]", + attachment.filename, error + )); + } + }; + return UserContent::text(format!( + "\n{}\n", + attachment.filename, attachment.mime_type, transcript + )); + } + let (provider_id, model_name) = match deps.llm_manager.resolve_model(voice_model) { Ok(parts) => parts, Err(error) => { diff --git a/src/lib.rs b/src/lib.rs index ed80aed32..be4eb274d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -22,6 +22,8 @@ pub mod skills; #[cfg(feature = "metrics")] pub mod telemetry; pub mod tools; +#[cfg(feature = "stt-whisper")] +pub mod stt; pub mod update; pub use error::{Error, Result}; diff --git a/src/stt.rs b/src/stt.rs new file mode 100644 index 000000000..2e0e5a8cc --- /dev/null +++ b/src/stt.rs @@ -0,0 +1,275 @@ +//! Local Whisper speech-to-text via whisper-rs. +//! +//! Only compiled when the `stt-whisper` feature is enabled. +//! Exposed as a single async `transcribe` function that lazily loads and caches +//! the model context for the lifetime of the process. + +#[cfg(feature = "stt-whisper")] +pub use local::transcribe; + +#[cfg(feature = "stt-whisper")] +mod local { + use std::sync::OnceLock; + + use hf_hub::api::sync::Api; + use whisper_rs::{FullParams, SamplingStrategy, WhisperContext, WhisperContextParameters}; + + /// Known model size names and their GGML filenames on `ggerganov/whisper.cpp`. + const KNOWN_SIZES: &[(&str, &str)] = &[ + ("tiny", "ggml-tiny.bin"), + ("tiny.en", "ggml-tiny.en.bin"), + ("base", "ggml-base.bin"), + ("base.en", "ggml-base.en.bin"), + ("small", "ggml-small.bin"), + ("small.en", "ggml-small.en.bin"), + ("medium", "ggml-medium.bin"), + ("medium.en", "ggml-medium.en.bin"), + ("large", "ggml-large-v3.bin"), + ("large-v1", "ggml-large-v1.bin"), + ("large-v2", "ggml-large-v2.bin"), + ("large-v3", "ggml-large-v3.bin"), + ]; + + /// Cached (model_spec, WhisperContext) — one per process. + /// + /// If the user changes `routing.voice` at runtime we just keep using the + /// already-loaded model; a restart is required to switch models. + static CONTEXT: OnceLock<(String, WhisperContext)> = OnceLock::new(); + + #[derive(Debug, thiserror::Error)] + pub enum WhisperError { + #[error("model not found and could not be downloaded: {0}")] + ModelNotFound(String), + #[error("hf-hub error: {0}")] + HfHub(String), + #[error("failed to load whisper model: {0}")] + Load(String), + #[error("failed to create whisper state: {0}")] + State(String), + #[error("transcription failed: {0}")] + Transcription(String), + #[error("audio decode error: {0}")] + Decode(String), + } + + /// Transcribe raw audio bytes using the local Whisper model. + /// + /// `model_spec` is the part after `whisper-local://`: + /// - A known size name (`small`, `medium`, `large`, …) — downloaded from HF + /// into the HF cache on first use. + /// - An absolute path (`/path/to/ggml-small.bin`) — loaded directly. + pub async fn transcribe(model_spec: &str, audio: &[u8]) -> Result { + let model_spec = model_spec.to_owned(); + let audio = audio.to_vec(); + + // Whisper inference is CPU-bound and blocking — run on a thread pool. + tokio::task::spawn_blocking(move || transcribe_blocking(&model_spec, &audio)) + .await + .map_err(|e| WhisperError::Transcription(e.to_string()))? + } + + fn transcribe_blocking(model_spec: &str, audio: &[u8]) -> Result { + let ctx = get_or_load_context(model_spec)?; + + let mut state = ctx + .create_state() + .map_err(|e| WhisperError::State(e.to_string()))?; + + let samples = decode_to_f32(audio)?; + + let mut params = FullParams::new(SamplingStrategy::Greedy { best_of: 1 }); + params.set_language(Some("auto")); + params.set_print_progress(false); + params.set_print_realtime(false); + params.set_print_timestamps(false); + + state + .full(params, &samples) + .map_err(|e| WhisperError::Transcription(e.to_string()))?; + + let n = state.full_n_segments(); + let mut parts = Vec::with_capacity(n as usize); + for i in 0..n { + if let Some(segment) = state.get_segment(i) { + if let Ok(text) = segment.to_str() { + let trimmed = text.trim(); + if !trimmed.is_empty() { + parts.push(trimmed.to_owned()); + } + } + } + } + + Ok(parts.join(" ")) + } + + /// Return the cached context, loading it first if necessary. + fn get_or_load_context(model_spec: &str) -> Result<&'static WhisperContext, WhisperError> { + if let Some((_, ctx)) = CONTEXT.get() { + return Ok(ctx); + } + + let model_path = resolve_model_path(model_spec)?; + + tracing::info!(model_path = %model_path, "loading local Whisper model"); + + let params = WhisperContextParameters::default(); + let ctx = WhisperContext::new_with_params(&model_path, params) + .map_err(|e| WhisperError::Load(e.to_string()))?; + + let _ = CONTEXT.set((model_spec.to_owned(), ctx)); + + tracing::info!(model_path = %model_path, "Whisper model loaded and cached"); + + Ok(&CONTEXT.get().unwrap().1) + } + + /// Resolve a model spec to an absolute path on disk, downloading via hf-hub if needed. + fn resolve_model_path(spec: &str) -> Result { + // Absolute path — use directly. + if spec.starts_with('/') { + if std::path::Path::new(spec).exists() { + return Ok(spec.to_owned()); + } + return Err(WhisperError::ModelNotFound(format!( + "model file not found: {spec}" + ))); + } + + // Known size name — fetch via hf-hub (uses HF_HOME cache, downloads if missing). + let filename = KNOWN_SIZES + .iter() + .find(|(name, _)| *name == spec) + .map(|(_, file)| *file) + .ok_or_else(|| { + WhisperError::ModelNotFound(format!( + "unknown model size '{spec}'; use one of: {}", + KNOWN_SIZES + .iter() + .map(|(n, _)| *n) + .collect::>() + .join(", ") + )) + })?; + + tracing::info!(model = %spec, filename = %filename, "fetching Whisper model via hf-hub"); + + let api = Api::new().map_err(|e| WhisperError::HfHub(e.to_string()))?; + let repo = api.model("ggerganov/whisper.cpp".to_owned()); + let path = repo + .get(filename) + .map_err(|e| WhisperError::HfHub(e.to_string()))?; + + Ok(path.to_string_lossy().to_string()) + } + + /// Decode arbitrary audio bytes to 16 kHz mono f32 samples for Whisper. + /// + /// Uses symphonia so ogg/opus, mp3, flac, wav, etc. all work without manual + /// format detection. + fn decode_to_f32(audio: &[u8]) -> Result, WhisperError> { + use symphonia::core::codecs::DecoderOptions; + use symphonia::core::formats::FormatOptions; + use symphonia::core::io::MediaSourceStream; + use symphonia::core::meta::MetadataOptions; + use symphonia::core::probe::Hint; + + let cursor = std::io::Cursor::new(audio.to_vec()); + let mss = MediaSourceStream::new(Box::new(cursor), Default::default()); + + let probed = symphonia::default::get_probe() + .format( + &Hint::new(), + mss, + &FormatOptions::default(), + &MetadataOptions::default(), + ) + .map_err(|e| WhisperError::Decode(e.to_string()))?; + + let mut format = probed.format; + let track = format + .tracks() + .iter() + .find(|t| { + t.codec_params.codec != symphonia::core::codecs::CODEC_TYPE_NULL + }) + .ok_or_else(|| WhisperError::Decode("no audio track found".into()))? + .clone(); + + let mut decoder = symphonia::default::get_codecs() + .make(&track.codec_params, &DecoderOptions::default()) + .map_err(|e| WhisperError::Decode(e.to_string()))?; + + let track_id = track.id; + let sample_rate = track.codec_params.sample_rate.unwrap_or(16000); + let channels = track + .codec_params + .channels + .map(|c| c.count()) + .unwrap_or(1); + + let mut raw_samples: Vec = Vec::new(); + + loop { + let packet = match format.next_packet() { + Ok(p) => p, + Err(symphonia::core::errors::Error::IoError(_)) => break, + Err(symphonia::core::errors::Error::ResetRequired) => break, + Err(e) => return Err(WhisperError::Decode(e.to_string())), + }; + + if packet.track_id() != track_id { + continue; + } + + let decoded = decoder + .decode(&packet) + .map_err(|e| WhisperError::Decode(e.to_string()))?; + + // Convert to f32 mono using a sample-converting audio buffer. + use symphonia::core::audio::{AudioBuffer, Signal as _}; + + let mut f32_buf: AudioBuffer = AudioBuffer::new( + decoded.capacity() as u64, + decoded.spec().clone(), + ); + decoded.convert(&mut f32_buf); + + // Mix down to mono. + let frames = f32_buf.frames(); + for frame in 0..frames { + let mut sum = 0f32; + for ch in 0..channels { + sum += f32_buf.chan(ch)[frame]; + } + raw_samples.push(sum / channels as f32); + } + } + + // Resample to 16 kHz if needed. + if sample_rate != 16000 { + raw_samples = resample(raw_samples, sample_rate, 16000); + } + + Ok(raw_samples) + } + + /// Simple linear resampler (good enough for speech; not for music). + fn resample(samples: Vec, from_hz: u32, to_hz: u32) -> Vec { + if from_hz == to_hz { + return samples; + } + let ratio = from_hz as f64 / to_hz as f64; + let out_len = (samples.len() as f64 / ratio) as usize; + let mut out = Vec::with_capacity(out_len); + for i in 0..out_len { + let pos = i as f64 * ratio; + let idx = pos as usize; + let frac = (pos - idx as f64) as f32; + let a = samples.get(idx).copied().unwrap_or(0.0); + let b = samples.get(idx + 1).copied().unwrap_or(0.0); + out.push(a + frac * (b - a)); + } + out + } +} From 3f1f3e3ee548867d13675195e624061177a86c4f Mon Sep 17 00:00:00 2001 From: Marenz Date: Sat, 21 Feb 2026 13:20:17 +0100 Subject: [PATCH 2/3] Enable Vulkan GPU backend and Ogg/Opus decode for local Whisper STT --- Cargo.lock | 32 +++++++++++++++++++++++ Cargo.toml | 6 +++-- src/stt.rs | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 111 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a41956207..1846f49df 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -621,6 +621,17 @@ version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" +[[package]] +name = "audiopus_sys" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62314a1546a2064e033665d658e88c620a62904be945f8147e6b16c3db9f8651" +dependencies = [ + "cmake", + "log", + "pkg-config", +] + [[package]] name = "autocfg" version = "1.5.0" @@ -5800,6 +5811,15 @@ dependencies = [ "web-time", ] +[[package]] +name = "ogg" +version = "0.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fdab8dcd8d4052eaacaf8fb07a3ccd9a6e26efadb42878a413c68fc4af1dee2b" +dependencies = [ + "byteorder", +] + [[package]] name = "once_cell" version = "1.21.3" @@ -5996,6 +6016,15 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d" +[[package]] +name = "opus" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4d3809943dff6fbad5f0484449ea26bdb9cb7d8efdf26ed50d3c7f227f69eb5c" +dependencies = [ + "audiopus_sys", +] + [[package]] name = "ordered-float" version = "5.1.0" @@ -8032,11 +8061,13 @@ dependencies = [ "mime_guess", "minijinja", "notify", + "ogg", "open", "opentelemetry", "opentelemetry-otlp", "opentelemetry-semantic-conventions", "opentelemetry_sdk", + "opus", "pdf-extract", "pin-project", "prometheus", @@ -10104,6 +10135,7 @@ version = "0.15.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "71ea5d2401f30f51d08126a2d133fee4c1955136519d7ac6cf6f5ac0a91e6bc8" dependencies = [ + "libc", "whisper-rs-sys", ] diff --git a/Cargo.toml b/Cargo.toml index f39ec2b5d..5c9f581fb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -134,15 +134,17 @@ tempfile = "3" # Prometheus metrics (optional, behind "metrics" feature) prometheus = { version = "0.13", optional = true } -whisper-rs = { version = "0.15", optional = true } +whisper-rs = { version = "0.15", optional = true, features = ["vulkan"] } hf-hub = { version = "0.5", optional = true } symphonia = { version = "0.5", features = ["mp3", "aac", "flac", "ogg", "wav", "isomp4"], optional = true } +ogg = { version = "0.9", optional = true } +opus = { version = "0.3", optional = true } pdf-extract = "0.10.0" open = "5.3.3" urlencoding = "2.1.3" [features] -stt-whisper = ["dep:whisper-rs", "dep:hf-hub", "dep:symphonia"] +stt-whisper = ["dep:whisper-rs", "dep:hf-hub", "dep:symphonia", "dep:ogg", "dep:opus"] metrics = ["dep:prometheus"] [lints.clippy] diff --git a/src/stt.rs b/src/stt.rs index 2e0e5a8cc..d07afa400 100644 --- a/src/stt.rs +++ b/src/stt.rs @@ -165,9 +165,13 @@ mod local { /// Decode arbitrary audio bytes to 16 kHz mono f32 samples for Whisper. /// - /// Uses symphonia so ogg/opus, mp3, flac, wav, etc. all work without manual - /// format detection. + /// Ogg/Opus (Telegram voice messages) is handled directly via the `ogg` + + /// `opus` crates. Everything else falls through to symphonia. fn decode_to_f32(audio: &[u8]) -> Result, WhisperError> { + if is_ogg_opus(audio) { + return decode_ogg_opus(audio); + } + use symphonia::core::codecs::DecoderOptions; use symphonia::core::formats::FormatOptions; use symphonia::core::io::MediaSourceStream; @@ -254,6 +258,75 @@ mod local { Ok(raw_samples) } + /// Check if the audio is an Ogg container with an Opus stream. + fn is_ogg_opus(audio: &[u8]) -> bool { + // OggS capture pattern at offset 0, and OpusHead magic at offset 28 + // (first packet of the first logical stream). + audio.starts_with(b"OggS") && audio.len() > 36 && &audio[28..36] == b"OpusHead" + } + + /// Decode Ogg/Opus audio to 16 kHz mono f32 samples. + fn decode_ogg_opus(audio: &[u8]) -> Result, WhisperError> { + use ogg::reading::PacketReader; + + let cursor = std::io::Cursor::new(audio); + let mut reader = PacketReader::new(cursor); + + // Skip the OpusHead and OpusTags header packets. + let mut header_packets = 0; + let mut decoder: Option = None; + let mut sample_rate = 48000u32; + let mut channels = 1usize; + let mut samples: Vec = Vec::new(); + + while let Ok(Some(packet)) = reader.read_packet() { + if header_packets < 2 { + if header_packets == 0 { + // Parse OpusHead to get channel count and pre-skip. + if packet.data.len() >= 11 && &packet.data[0..8] == b"OpusHead" { + channels = packet.data[9] as usize; + // Output sample rate is always 48000 for libopus. + sample_rate = 48000; + } + decoder = Some( + opus::Decoder::new(sample_rate, if channels == 2 { + opus::Channels::Stereo + } else { + opus::Channels::Mono + }) + .map_err(|e| WhisperError::Decode(e.to_string()))?, + ); + } + header_packets += 1; + continue; + } + + let dec = decoder.as_mut().unwrap(); + // Max Opus frame: 120ms at 48kHz = 5760 samples per channel. + let max_samples = 5760 * channels; + let mut pcm = vec![0f32; max_samples]; + let n = dec + .decode_float(&packet.data, &mut pcm, false) + .map_err(|e| WhisperError::Decode(e.to_string()))?; + + // Mix down to mono. + if channels == 1 { + samples.extend_from_slice(&pcm[..n]); + } else { + for frame in 0..n { + let mut sum = 0f32; + for ch in 0..channels { + sum += pcm[frame * channels + ch]; + } + samples.push(sum / channels as f32); + } + } + } + + // Resample from 48 kHz to 16 kHz. + Ok(resample(samples, sample_rate, 16000)) + } + /// Simple linear resampler (good enough for speech; not for music). fn resample(samples: Vec, from_hz: u32, to_hz: u32) -> Vec { if from_hz == to_hz { From aeea2a14b4139cc1d830010dbbd5932da9e53e66 Mon Sep 17 00:00:00 2001 From: Marenz Date: Sat, 21 Feb 2026 13:36:15 +0100 Subject: [PATCH 3/3] docs: document local Whisper STT backend in README --- README.md | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/README.md b/README.md index 82c98caf9..5bd52cf01 100644 --- a/README.md +++ b/README.md @@ -193,6 +193,30 @@ channel = "my-provider/my-model" Additional built-in providers include **NVIDIA**, **MiniMax**, **Moonshot AI (Kimi)**, and **Z.AI Coding Plan** — configure with `nvidia_key`, `minimax_key`, `moonshot_key`, or `zai_coding_plan_key` in `[llm]`. +### Voice Transcription + +Audio attachments (voice messages, audio files) are transcribed before being passed to the channel. Set `routing.voice` to choose the backend: + +**Provider-based** — route through any configured LLM provider that supports audio input: + +```toml +[defaults.routing] +voice = "openai/whisper-1" +``` + +**Local Whisper** (`stt-whisper` feature, requires `--features stt-whisper` at build time) — run inference locally via [whisper-rs](https://codeberg.org/tazz4843/whisper-rs), no API call needed: + +```toml +[defaults.routing] +voice = "whisper-local://small" +``` + +The model is downloaded automatically from [`ggerganov/whisper.cpp`](https://huggingface.co/ggerganov/whisper.cpp) on first use and cached in `~/.cache/huggingface/hub`. Supported size names: `tiny`, `tiny.en`, `base`, `base.en`, `small`, `small.en`, `medium`, `medium.en`, `large`, `large-v1`, `large-v2`, `large-v3`. An absolute path to a GGML model file also works. + +GPU acceleration via Vulkan is enabled automatically when a compatible device is detected. The loaded model is cached for the process lifetime — restart to switch models. + +Ogg/Opus audio (Telegram voice messages) is decoded natively. All other formats are handled via symphonia. + ### Skills Extensible skill system integrated with [skills.sh](https://skills.sh):