From cd8e9d4481332782a99c347fb2573385837c2dda Mon Sep 17 00:00:00 2001
From: Douwe Osinga
Date: Fri, 30 Jan 2026 12:25:40 -0500
Subject: [PATCH 1/4] Rejig dictation
---
crates/goose-server/src/openapi.rs | 6 +
crates/goose-server/src/routes/audio.rs | 512 ------------------
crates/goose-server/src/routes/dictation.rs | 412 ++++++++++++++
crates/goose-server/src/routes/mod.rs | 4 +-
ui/desktop/openapi.json | 157 ++++++
ui/desktop/src/api/index.ts | 4 +-
ui/desktop/src/api/sdk.gen.ts | 13 +-
ui/desktop/src/api/types.gen.ts | 121 +++++
ui/desktop/src/components/ChatInput.tsx | 46 +-
.../settings/chat/ChatSettingsSection.tsx | 20 +-
.../settings/dictation/DictationSection.tsx | 5 -
.../settings/dictation/DictationSettings.tsx | 249 +++++++++
.../settings/dictation/ElevenLabsKeyInput.tsx | 128 -----
.../settings/dictation/ProviderInfo.tsx | 41 --
.../settings/dictation/ProviderSelector.tsx | 128 -----
.../dictation/VoiceDictationToggle.tsx | 97 ----
ui/desktop/src/hooks/dictationConstants.ts | 31 --
ui/desktop/src/hooks/useAudioRecorder.ts | 249 +++++++++
ui/desktop/src/hooks/useDictationSettings.ts | 76 ---
ui/desktop/src/hooks/useWhisper.ts | 378 -------------
ui/desktop/src/updates.ts | 1 -
21 files changed, 1237 insertions(+), 1441 deletions(-)
delete mode 100644 crates/goose-server/src/routes/audio.rs
create mode 100644 crates/goose-server/src/routes/dictation.rs
delete mode 100644 ui/desktop/src/components/settings/dictation/DictationSection.tsx
create mode 100644 ui/desktop/src/components/settings/dictation/DictationSettings.tsx
delete mode 100644 ui/desktop/src/components/settings/dictation/ElevenLabsKeyInput.tsx
delete mode 100644 ui/desktop/src/components/settings/dictation/ProviderInfo.tsx
delete mode 100644 ui/desktop/src/components/settings/dictation/ProviderSelector.tsx
delete mode 100644 ui/desktop/src/components/settings/dictation/VoiceDictationToggle.tsx
delete mode 100644 ui/desktop/src/hooks/dictationConstants.ts
create mode 100644 ui/desktop/src/hooks/useAudioRecorder.ts
delete mode 100644 ui/desktop/src/hooks/useDictationSettings.ts
delete mode 100644 ui/desktop/src/hooks/useWhisper.ts
diff --git a/crates/goose-server/src/openapi.rs b/crates/goose-server/src/openapi.rs
index 78158fd2e0c4..fa9f8a09564d 100644
--- a/crates/goose-server/src/openapi.rs
+++ b/crates/goose-server/src/openapi.rs
@@ -412,6 +412,8 @@ derive_utoipa!(Icon as IconSchema);
super::routes::tunnel::stop_tunnel,
super::routes::tunnel::get_tunnel_status,
super::routes::telemetry::send_telemetry_event,
+ super::routes::dictation::transcribe_dictation,
+ super::routes::dictation::get_dictation_config,
),
components(schemas(
super::routes::config_management::UpsertConfigQuery,
@@ -570,6 +572,10 @@ derive_utoipa!(Icon as IconSchema);
goose::goose_apps::CspMetadata,
goose::goose_apps::UiMetadata,
goose::goose_apps::ResourceMetadata,
+ super::routes::dictation::TranscribeRequest,
+ super::routes::dictation::TranscribeResponse,
+ super::routes::dictation::DictationProvider,
+ super::routes::dictation::DictationProviderStatus,
))
)]
pub struct ApiDoc;
diff --git a/crates/goose-server/src/routes/audio.rs b/crates/goose-server/src/routes/audio.rs
deleted file mode 100644
index c0364ef3d768..000000000000
--- a/crates/goose-server/src/routes/audio.rs
+++ /dev/null
@@ -1,512 +0,0 @@
-/// Audio transcription route handler
-///
-/// This module provides endpoints for audio transcription using OpenAI's Whisper API.
-/// The OpenAI API key must be configured in the backend for this to work.
-use crate::routes::errors::ErrorResponse;
-use crate::state::AppState;
-use axum::{
- http::StatusCode,
- routing::{get, post},
- Json, Router,
-};
-use base64::{engine::general_purpose::STANDARD as BASE64, Engine};
-use reqwest::Client;
-use serde::{Deserialize, Serialize};
-use std::sync::Arc;
-use std::time::Duration;
-
-// Constants
-const MAX_AUDIO_SIZE_BYTES: usize = 25 * 1024 * 1024; // 25MB
-const OPENAI_TIMEOUT_SECONDS: u64 = 30;
-
-#[derive(Debug, Deserialize)]
-struct TranscribeRequest {
- audio: String, // Base64 encoded audio data
- mime_type: String,
-}
-
-#[derive(Debug, Deserialize)]
-struct TranscribeElevenLabsRequest {
- audio: String, // Base64 encoded audio data
- mime_type: String,
-}
-
-#[derive(Debug, Serialize)]
-struct TranscribeResponse {
- text: String,
-}
-
-#[derive(Debug, Deserialize)]
-struct WhisperResponse {
- text: String,
-}
-
-/// Validate audio input and return decoded bytes and file extension
-fn validate_audio_input(
- audio: &str,
- mime_type: &str,
-) -> Result<(Vec<u8>, &'static str), ErrorResponse> {
- // Decode the base64 audio data
- let audio_bytes = BASE64
- .decode(audio)
- .map_err(|_| ErrorResponse::bad_request("Invalid base64 audio data"))?;
-
- // Check file size
- if audio_bytes.len() > MAX_AUDIO_SIZE_BYTES {
- return Err(ErrorResponse {
- message: format!(
- "Audio file too large: {} bytes (max: {} bytes)",
- audio_bytes.len(),
- MAX_AUDIO_SIZE_BYTES
- ),
- status: StatusCode::PAYLOAD_TOO_LARGE,
- });
- }
-
- // Determine file extension based on MIME type
- let file_extension = match mime_type {
- "audio/webm" => "webm",
- "audio/webm;codecs=opus" => "webm",
- "audio/mp4" => "mp4",
- "audio/mpeg" => "mp3",
- "audio/mpga" => "mpga",
- "audio/m4a" => "m4a",
- "audio/wav" => "wav",
- "audio/x-wav" => "wav",
- _ => {
- return Err(ErrorResponse {
- message: format!("Unsupported audio format: {}", mime_type),
- status: StatusCode::UNSUPPORTED_MEDIA_TYPE,
- })
- }
- };
-
- Ok((audio_bytes, file_extension))
-}
-
-/// Get OpenAI configuration (API key and host)
-fn get_openai_config() -> Result<(String, String), ErrorResponse> {
- let config = goose::config::Config::global();
-
- let api_key: String = config.get_secret("OPENAI_API_KEY").map_err(|e| match e {
- goose::config::ConfigError::NotFound(_) => ErrorResponse {
- message: "OpenAI API key not configured. Please set OPENAI_API_KEY in settings."
- .to_string(),
- status: StatusCode::PRECONDITION_FAILED,
- },
- _ => ErrorResponse::internal(format!("Failed to get OpenAI API key: {:?}", e)),
- })?;
-
- let openai_host = match config.get("OPENAI_HOST", false) {
- Ok(value) => value
- .as_str()
- .map(|s| s.to_string())
- .unwrap_or_else(|| "https://api.openai.com".to_string()),
- Err(_) => "https://api.openai.com".to_string(),
- };
-
- Ok((api_key, openai_host))
-}
-
-/// Send transcription request to OpenAI Whisper API
-async fn send_openai_request(
- audio_bytes: Vec<u8>,
- file_extension: &str,
- mime_type: &str,
- api_key: &str,
- openai_host: &str,
-) -> Result<WhisperResponse, ErrorResponse> {
- tracing::info!("Using OpenAI host: {}", openai_host);
- tracing::info!(
- "Audio file size: {} bytes, extension: {}, mime_type: {}",
- audio_bytes.len(),
- file_extension,
- mime_type
- );
-
- // Create a multipart form with the audio file
- let part = reqwest::multipart::Part::bytes(audio_bytes)
- .file_name(format!("audio.{}", file_extension))
- .mime_str(mime_type)
- .map_err(|e| {
- ErrorResponse::internal(format!("Failed to create multipart part: {:?}", e))
- })?;
-
- let form = reqwest::multipart::Form::new()
- .part("file", part)
- .text("model", "whisper-1")
- .text("response_format", "json");
-
- tracing::info!("Created multipart form for OpenAI Whisper API");
-
- // Make request to OpenAI Whisper API
- let client = Client::builder()
- .timeout(Duration::from_secs(OPENAI_TIMEOUT_SECONDS))
- .build()
- .map_err(|e| ErrorResponse::internal(format!("Failed to create HTTP client: {}", e)))?;
-
- tracing::info!(
- "Sending request to OpenAI: {}/v1/audio/transcriptions",
- openai_host
- );
-
- let response = client
- .post(format!("{}/v1/audio/transcriptions", openai_host))
- .header("Authorization", format!("Bearer {}", api_key))
- .multipart(form)
- .send()
- .await
- .map_err(|e| {
- if e.is_timeout() {
- ErrorResponse {
- message: format!(
- "OpenAI API request timed out after {}s",
- OPENAI_TIMEOUT_SECONDS
- ),
- status: StatusCode::GATEWAY_TIMEOUT,
- }
- } else {
- ErrorResponse {
- message: format!("Failed to send request to OpenAI: {}", e),
- status: StatusCode::SERVICE_UNAVAILABLE,
- }
- }
- })?;
-
- tracing::info!(
- "Received response from OpenAI with status: {}",
- response.status()
- );
-
- if !response.status().is_success() {
- let status = response.status();
- let error_text = response.text().await.unwrap_or_default();
- tracing::error!("OpenAI API error (status: {}): {}", status, error_text);
-
- // Check for specific error codes
- if status == 401 {
- return Err(ErrorResponse {
- message: "OpenAI API key appears to be invalid or unauthorized".to_string(),
- status: StatusCode::UNAUTHORIZED,
- });
- } else if status == 429 {
- return Err(ErrorResponse {
- message: "OpenAI API quota or rate limit exceeded".to_string(),
- status: StatusCode::TOO_MANY_REQUESTS,
- });
- }
-
- return Err(ErrorResponse {
- message: format!("OpenAI API error: {}", error_text),
- status: StatusCode::BAD_GATEWAY,
- });
- }
-
- let whisper_response: WhisperResponse = response
- .json()
- .await
- .map_err(|e| ErrorResponse::internal(format!("Failed to parse OpenAI response: {}", e)))?;
-
- Ok(whisper_response)
-}
-
-/// Transcribe audio using OpenAI's Whisper API
-///
-/// # Request
-/// - `audio`: Base64 encoded audio data
-/// - `mime_type`: MIME type of the audio (e.g., "audio/webm", "audio/wav")
-///
-/// # Response
-/// - `text`: Transcribed text from the audio
-///
-/// # Errors
-/// - 401: Unauthorized (missing or invalid X-Secret-Key header)
-/// - 412: Precondition Failed (OpenAI API key not configured)
-/// - 400: Bad Request (invalid base64 audio data)
-/// - 413: Payload Too Large (audio file exceeds 25MB limit)
-/// - 415: Unsupported Media Type (unsupported audio format)
-/// - 502: Bad Gateway (OpenAI API error)
-/// - 503: Service Unavailable (network error)
-async fn transcribe_handler(
- Json(request): Json<TranscribeRequest>,
-) -> Result<Json<TranscribeResponse>, ErrorResponse> {
- let (audio_bytes, file_extension) = validate_audio_input(&request.audio, &request.mime_type)?;
- let (api_key, openai_host) = get_openai_config()?;
-
- let whisper_response = send_openai_request(
- audio_bytes,
- file_extension,
- &request.mime_type,
- &api_key,
- &openai_host,
- )
- .await?;
-
- Ok(Json(TranscribeResponse {
- text: whisper_response.text,
- }))
-}
-
-/// Transcribe audio using ElevenLabs Speech-to-Text API
-///
-/// Uses ElevenLabs' speech-to-text endpoint for transcription.
-/// Requires an ElevenLabs API key with speech-to-text access.
-async fn transcribe_elevenlabs_handler(
- Json(request): Json<TranscribeElevenLabsRequest>,
-) -> Result<Json<TranscribeResponse>, ErrorResponse> {
- let (audio_bytes, file_extension) = validate_audio_input(&request.audio, &request.mime_type)?;
-
- // Get the ElevenLabs API key from config (after input validation)
- let config = goose::config::Config::global();
-
- // First try to get it as a secret
- let api_key: String = match config.get_secret::<String>("ELEVENLABS_API_KEY") {
- Ok(key) => key,
- Err(_) => {
- // Try to get it as non-secret (for backward compatibility)
- match config.get("ELEVENLABS_API_KEY", false) {
- Ok(value) => {
- match value.as_str() {
- Some(key_str) => {
- let key = key_str.to_string();
- // Migrate to secret storage
- if let Err(e) = config.set(
- "ELEVENLABS_API_KEY",
- &serde_json::Value::String(key.clone()),
- true,
- ) {
- tracing::error!("Failed to migrate ElevenLabs API key: {:?}", e);
- }
- // Delete the non-secret version
- if let Err(e) = config.delete("ELEVENLABS_API_KEY") {
- tracing::warn!(
- "Failed to delete non-secret ElevenLabs API key: {:?}",
- e
- );
- }
- key
- }
- None => {
- return Err(ErrorResponse::bad_request(format!(
- "ElevenLabs API key is not a string, found: {:?}",
- value
- )));
- }
- }
- }
- Err(_) => {
- return Err(ErrorResponse::bad_request(
- "No ElevenLabs API key found in configuration",
- ));
- }
- }
- }
- };
-
- // Create multipart form for ElevenLabs API
- let part = reqwest::multipart::Part::bytes(audio_bytes)
- .file_name(format!("audio.{}", file_extension))
- .mime_str(&request.mime_type)
- .map_err(|_| ErrorResponse::internal("Failed to create multipart part"))?;
-
- let form = reqwest::multipart::Form::new()
- .part("file", part) // Changed from "audio" to "file"
- .text("model_id", "scribe_v1") // Use the correct model_id for speech-to-text
- .text("tag_audio_events", "false")
- .text("diarize", "false");
-
- // Make request to ElevenLabs Speech-to-Text API
- let client = Client::builder()
- .timeout(Duration::from_secs(OPENAI_TIMEOUT_SECONDS))
- .build()
- .map_err(|e| ErrorResponse::internal(format!("Failed to create HTTP client: {}", e)))?;
-
- let response = client
- .post("https://api.elevenlabs.io/v1/speech-to-text")
- .header("xi-api-key", &api_key)
- .multipart(form)
- .send()
- .await
- .map_err(|e| {
- if e.is_timeout() {
- ErrorResponse {
- message: format!(
- "ElevenLabs API request timed out after {}s",
- OPENAI_TIMEOUT_SECONDS
- ),
- status: StatusCode::GATEWAY_TIMEOUT,
- }
- } else {
- ErrorResponse {
- message: format!("Failed to send request to ElevenLabs: {}", e),
- status: StatusCode::SERVICE_UNAVAILABLE,
- }
- }
- })?;
-
- if !response.status().is_success() {
- let status = response.status();
- let error_text = response.text().await.unwrap_or_default();
- tracing::error!("ElevenLabs API error (status: {}): {}", status, error_text);
-
- // Check for specific error codes
- if error_text.contains("Unauthorized") || error_text.contains("Invalid API key") {
- return Err(ErrorResponse {
- message: "ElevenLabs API key is invalid or unauthorized".to_string(),
- status: StatusCode::UNAUTHORIZED,
- });
- } else if error_text.contains("quota") || error_text.contains("limit") {
- return Err(ErrorResponse {
- message: "ElevenLabs API quota or rate limit exceeded".to_string(),
- status: StatusCode::PAYMENT_REQUIRED,
- });
- }
-
- return Err(ErrorResponse {
- message: format!("ElevenLabs API error: {}", error_text),
- status: StatusCode::BAD_GATEWAY,
- });
- }
-
- // Parse ElevenLabs response
- #[derive(Debug, Deserialize)]
- struct ElevenLabsResponse {
- text: String,
- #[serde(rename = "chunks")]
- #[allow(dead_code)]
- _chunks: Option<Vec<serde_json::Value>>,
- }
-
- let elevenlabs_response: ElevenLabsResponse = response.json().await.map_err(|e| {
- ErrorResponse::internal(format!("Failed to parse ElevenLabs response: {}", e))
- })?;
-
- Ok(Json(TranscribeResponse {
- text: elevenlabs_response.text,
- }))
-}
-
-/// Check if dictation providers are configured
-///
-/// Returns configuration status for dictation providers
-async fn check_dictation_config() -> Result<Json<serde_json::Value>, ErrorResponse> {
- let config = goose::config::Config::global();
-
- // Check if ElevenLabs API key is configured
- let has_elevenlabs = match config.get_secret::<String>("ELEVENLABS_API_KEY") {
- Ok(_) => true,
- Err(_) => {
- // Check non-secret for backward compatibility
- config.get("ELEVENLABS_API_KEY", false).is_ok()
- }
- };
-
- Ok(Json(serde_json::json!({
- "elevenlabs": has_elevenlabs
- })))
-}
-
-pub fn routes(state: Arc<AppState>) -> Router {
- Router::new()
- .route("/audio/transcribe", post(transcribe_handler))
- .route(
- "/audio/transcribe/elevenlabs",
- post(transcribe_elevenlabs_handler),
- )
- .route("/audio/config", get(check_dictation_config))
- .with_state(state)
-}
-
-#[cfg(test)]
-mod tests {
- use super::*;
- use axum::{body::Body, http::Request};
- use tower::ServiceExt;
- use wiremock::matchers::{method, path};
- use wiremock::{Mock, MockServer, ResponseTemplate};
-
- #[tokio::test(flavor = "multi_thread")]
- async fn test_transcribe_endpoint_requires_auth() {
- let mock_server = MockServer::start().await;
- Mock::given(method("POST"))
- .and(path("/v1/audio/transcriptions"))
- .respond_with(ResponseTemplate::new(401))
- .mount(&mock_server)
- .await;
-
- let _guard = env_lock::lock_env([
- ("OPENAI_API_KEY", Some("fake-key")),
- ("OPENAI_HOST", Some(mock_server.uri().as_str())),
- ]);
-
- let state = AppState::new().await.unwrap();
- let app = routes(state);
- let request = Request::builder()
- .uri("/audio/transcribe")
- .method("POST")
- .header("content-type", "application/json")
- .body(Body::from(
- serde_json::to_string(&serde_json::json!({
- "audio": "dGVzdA==",
- "mime_type": "audio/webm"
- }))
- .unwrap(),
- ))
- .unwrap();
-
- let response = app.oneshot(request).await.unwrap();
- assert_eq!(response.status(), StatusCode::UNAUTHORIZED);
- }
-
- #[tokio::test(flavor = "multi_thread")]
- async fn test_transcribe_endpoint_validates_size() {
- let state = AppState::new().await.unwrap();
- let app = routes(state);
-
- let request = Request::builder()
- .uri("/audio/transcribe")
- .method("POST")
- .header("content-type", "application/json")
- .header("x-secret-key", "test-secret")
- .body(Body::from(
- serde_json::to_string(&serde_json::json!({
- "audio": "dGVzdA==",
- "mime_type": "application/pdf" // Invalid MIME type
- }))
- .unwrap(),
- ))
- .unwrap();
-
- let response = app.oneshot(request).await.unwrap();
- assert!(
- response.status() == StatusCode::UNSUPPORTED_MEDIA_TYPE
- || response.status() == StatusCode::PRECONDITION_FAILED
- );
- }
-
- #[tokio::test(flavor = "multi_thread")]
- async fn test_transcribe_endpoint_validates_mime_type() {
- let state = AppState::new().await.unwrap();
- let app = routes(state);
-
- let request = Request::builder()
- .uri("/audio/transcribe")
- .method("POST")
- .header("content-type", "application/json")
- .header("x-secret-key", "test-secret")
- .body(Body::from(
- serde_json::to_string(&serde_json::json!({
- "audio": "invalid-base64-!@#$%",
- "mime_type": "audio/webm"
- }))
- .unwrap(),
- ))
- .unwrap();
-
- let response = app.oneshot(request).await.unwrap();
- assert!(
- response.status() == StatusCode::BAD_REQUEST
- || response.status() == StatusCode::PRECONDITION_FAILED
- );
- }
-}
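Note: the deleted handler above silently migrated a plaintext ELEVENLABS_API_KEY into secret storage before use, while the replacement module below reads only the secret, so a key stored non-secret by an older build will now report as unconfigured. If that fallback is still wanted, a minimal sketch using the same Config calls the deleted code used (get_secret, get, set, delete); the helper name and the &Config parameter are illustrative only:

    // Sketch only: restores the plaintext -> secret fallback dropped by this
    // patch, mirroring the config API usage from the deleted handler above.
    fn read_key_with_migration(config: &goose::config::Config, key: &str) -> Option<String> {
        if let Ok(secret) = config.get_secret::<String>(key) {
            return Some(secret);
        }
        // Fall back to a non-secret value and migrate it into secret storage.
        let value = config.get(key, false).ok()?;
        let plain = value.as_str()?.to_string();
        if config
            .set(key, &serde_json::Value::String(plain.clone()), true)
            .is_ok()
        {
            // Best effort: drop the old plaintext entry after migration.
            let _ = config.delete(key);
        }
        Some(plain)
    }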
diff --git a/crates/goose-server/src/routes/dictation.rs b/crates/goose-server/src/routes/dictation.rs
new file mode 100644
index 000000000000..d9658c7f01a4
--- /dev/null
+++ b/crates/goose-server/src/routes/dictation.rs
@@ -0,0 +1,412 @@
+use crate::routes::errors::ErrorResponse;
+use crate::state::AppState;
+use axum::{
+ http::StatusCode,
+ routing::{get, post},
+ Json, Router,
+};
+use base64::{engine::general_purpose::STANDARD as BASE64, Engine};
+use reqwest::Client;
+use serde::{Deserialize, Serialize};
+use std::collections::HashMap;
+use std::sync::Arc;
+use std::time::Duration;
+use utoipa::ToSchema;
+
+const MAX_AUDIO_SIZE_BYTES: usize = 25 * 1024 * 1024;
+const REQUEST_TIMEOUT: Duration = Duration::from_secs(30);
+
+// Dictation provider definitions
+struct DictationProviderDef {
+ config_key: &'static str,
+ default_url: &'static str,
+ host_key: Option<&'static str>,
+ description: &'static str,
+ uses_provider_config: bool,
+ settings_path: Option<&'static str>,
+}
+
+const PROVIDERS: &[(&str, DictationProviderDef)] = &[
+ (
+ "openai",
+ DictationProviderDef {
+ config_key: "OPENAI_API_KEY",
+ default_url: "https://api.openai.com/v1/audio/transcriptions",
+ host_key: Some("OPENAI_HOST"),
+ description: "Uses OpenAI Whisper API for high-quality transcription.",
+ uses_provider_config: true,
+ settings_path: Some("Settings > Models"),
+ },
+ ),
+ (
+ "elevenlabs",
+ DictationProviderDef {
+ config_key: "ELEVENLABS_API_KEY",
+ default_url: "https://api.elevenlabs.io/v1/speech-to-text",
+ host_key: None,
+ description: "Uses ElevenLabs speech-to-text API for advanced voice processing.",
+ uses_provider_config: false,
+ settings_path: None,
+ },
+ ),
+];
+
+fn get_provider_def(name: &str) -> Option<&'static DictationProviderDef> {
+ PROVIDERS
+ .iter()
+ .find_map(|(n, def)| if *n == name { Some(def) } else { None })
+}
+
+#[derive(Debug, Deserialize, ToSchema)]
+#[serde(rename_all = "lowercase")]
+pub enum DictationProvider {
+ OpenAI,
+ ElevenLabs,
+}
+
+impl DictationProvider {
+ fn as_str(&self) -> &'static str {
+ match self {
+ DictationProvider::OpenAI => "openai",
+ DictationProvider::ElevenLabs => "elevenlabs",
+ }
+ }
+}
+
+#[derive(Debug, Deserialize, ToSchema)]
+pub struct TranscribeRequest {
+ /// Base64 encoded audio data
+ pub audio: String,
+ /// MIME type of the audio (e.g., "audio/webm", "audio/wav")
+ pub mime_type: String,
+ /// Transcription provider to use
+ pub provider: DictationProvider,
+}
+
+#[derive(Debug, Serialize, Deserialize, ToSchema)]
+pub struct TranscribeResponse {
+ /// Transcribed text from the audio
+ pub text: String,
+}
+
+#[derive(Debug, Serialize, ToSchema)]
+pub struct DictationProviderStatus {
+ /// Whether the provider is fully configured and ready to use
+ pub configured: bool,
+ /// Custom host URL if configured (only for providers that support it)
+ #[serde(skip_serializing_if = "Option::is_none")]
+ pub host: Option<String>,
+ /// Description of what this provider does
+ pub description: String,
+ /// Whether this provider uses the main provider config (true) or has its own key (false)
+ pub uses_provider_config: bool,
+ /// Path to settings if uses_provider_config is true
+ #[serde(skip_serializing_if = "Option::is_none")]
+ pub settings_path: Option<String>,
+ /// Config key name if uses_provider_config is false
+ #[serde(skip_serializing_if = "Option::is_none")]
+ pub config_key: Option<String>,
+}
+
+fn validate_audio(audio: &str, mime_type: &str) -> Result<(Vec<u8>, &'static str), ErrorResponse> {
+ let audio_bytes = BASE64
+ .decode(audio)
+ .map_err(|_| ErrorResponse::bad_request("Invalid base64 audio data"))?;
+
+ if audio_bytes.len() > MAX_AUDIO_SIZE_BYTES {
+ return Err(ErrorResponse {
+ message: format!(
+ "Audio file too large: {} bytes (max: {} bytes)",
+ audio_bytes.len(),
+ MAX_AUDIO_SIZE_BYTES
+ ),
+ status: StatusCode::PAYLOAD_TOO_LARGE,
+ });
+ }
+
+ let extension = match mime_type {
+ "audio/webm" | "audio/webm;codecs=opus" => "webm",
+ "audio/mp4" => "mp4",
+ "audio/mpeg" | "audio/mpga" => "mp3",
+ "audio/m4a" => "m4a",
+ "audio/wav" | "audio/x-wav" => "wav",
+ _ => {
+ return Err(ErrorResponse {
+ message: format!("Unsupported audio format: {}", mime_type),
+ status: StatusCode::UNSUPPORTED_MEDIA_TYPE,
+ })
+ }
+ };
+
+ Ok((audio_bytes, extension))
+}
+
+fn get_provider_config(provider: &str) -> Result<(String, String), ErrorResponse> {
+ let config = goose::config::Config::global();
+ let def = get_provider_def(provider)
+ .ok_or_else(|| ErrorResponse::bad_request(format!("Unknown provider: {}", provider)))?;
+
+ let api_key = config
+ .get_secret(def.config_key)
+ .map_err(|_| ErrorResponse {
+ message: format!("{} not configured", def.config_key),
+ status: StatusCode::PRECONDITION_FAILED,
+ })?;
+
+ let url = if let Some(host_key) = def.host_key {
+ // If host_key is configured, replace the host part of the default URL
+ if let Some(custom_host) = config
+ .get(host_key, false)
+ .ok()
+ .and_then(|v| v.as_str().map(|s| s.to_string()))
+ {
+ // Extract the path from default_url (everything after the third slash)
+ // e.g., "https://api.openai.com/v1/audio/transcriptions" -> "/v1/audio/transcriptions"
+ let path = def
+ .default_url
+ .splitn(4, '/')
+ .nth(3)
+ .map(|p| format!("/{}", p))
+ .unwrap_or_else(|| "".to_string());
+
+ // Remove trailing slash from custom host if present
+ let custom_host = custom_host.trim_end_matches('/');
+
+ format!("{}{}", custom_host, path)
+ } else {
+ def.default_url.to_string()
+ }
+ } else {
+ def.default_url.to_string()
+ };
+
+ Ok((api_key, url))
+}
+
+async fn transcribe_openai(
+ audio_bytes: Vec<u8>,
+ extension: &str,
+ mime_type: &str,
+ api_key: &str,
+ url: &str,
+) -> Result<String, ErrorResponse> {
+ let part = reqwest::multipart::Part::bytes(audio_bytes)
+ .file_name(format!("audio.{}", extension))
+ .mime_str(mime_type)
+ .map_err(|e| ErrorResponse::internal(format!("Failed to create multipart: {}", e)))?;
+
+ let form = reqwest::multipart::Form::new()
+ .part("file", part)
+ .text("model", "whisper-1");
+
+ let client = Client::builder()
+ .timeout(REQUEST_TIMEOUT)
+ .build()
+ .map_err(|e| ErrorResponse::internal(format!("Failed to create client: {}", e)))?;
+
+ let response = client
+ .post(url)
+ .header("Authorization", format!("Bearer {}", api_key))
+ .multipart(form)
+ .send()
+ .await
+ .map_err(|e| {
+ if e.is_timeout() {
+ ErrorResponse {
+ message: "Request timed out".to_string(),
+ status: StatusCode::GATEWAY_TIMEOUT,
+ }
+ } else {
+ ErrorResponse {
+ message: format!("Request failed: {}", e),
+ status: StatusCode::SERVICE_UNAVAILABLE,
+ }
+ }
+ })?;
+
+ if !response.status().is_success() {
+ let status = response.status();
+ let error_text = response.text().await.unwrap_or_default();
+
+ return Err(ErrorResponse {
+ message: if status == 401 {
+ "Invalid API key".to_string()
+ } else if status == 429 {
+ "Rate limit exceeded".to_string()
+ } else {
+ format!("API error: {}", error_text)
+ },
+ status: if status.is_client_error() {
+ status
+ } else {
+ StatusCode::BAD_GATEWAY
+ },
+ });
+ }
+
+ let data: TranscribeResponse = response
+ .json()
+ .await
+ .map_err(|e| ErrorResponse::internal(format!("Failed to parse response: {}", e)))?;
+
+ Ok(data.text)
+}
+
+async fn transcribe_elevenlabs(
+ audio_bytes: Vec<u8>,
+ extension: &str,
+ mime_type: &str,
+ api_key: &str,
+ url: &str,
+) -> Result<String, ErrorResponse> {
+ let part = reqwest::multipart::Part::bytes(audio_bytes)
+ .file_name(format!("audio.{}", extension))
+ .mime_str(mime_type)
+ .map_err(|_| ErrorResponse::internal("Failed to create multipart"))?;
+
+ let form = reqwest::multipart::Form::new()
+ .part("file", part)
+ .text("model_id", "scribe_v1");
+
+ let client = Client::builder()
+ .timeout(REQUEST_TIMEOUT)
+ .build()
+ .map_err(|e| ErrorResponse::internal(format!("Failed to create client: {}", e)))?;
+
+ let response = client
+ .post(url)
+ .header("xi-api-key", api_key)
+ .multipart(form)
+ .send()
+ .await
+ .map_err(|e| {
+ if e.is_timeout() {
+ ErrorResponse {
+ message: "Request timed out".to_string(),
+ status: StatusCode::GATEWAY_TIMEOUT,
+ }
+ } else {
+ ErrorResponse {
+ message: format!("Request failed: {}", e),
+ status: StatusCode::SERVICE_UNAVAILABLE,
+ }
+ }
+ })?;
+
+ if !response.status().is_success() {
+ let status = response.status();
+ let error_text = response.text().await.unwrap_or_default();
+
+ return Err(ErrorResponse {
+ message: if error_text.contains("Unauthorized")
+ || error_text.contains("Invalid API key")
+ {
+ "Invalid API key".to_string()
+ } else if error_text.contains("quota") || error_text.contains("limit") {
+ "Rate limit exceeded".to_string()
+ } else {
+ format!("API error: {}", error_text)
+ },
+ status: if status.is_client_error() {
+ status
+ } else {
+ StatusCode::BAD_GATEWAY
+ },
+ });
+ }
+
+ let data: TranscribeResponse = response
+ .json()
+ .await
+ .map_err(|e| ErrorResponse::internal(format!("Failed to parse response: {}", e)))?;
+
+ Ok(data.text)
+}
+
+#[utoipa::path(
+ post,
+ path = "/dictation/transcribe",
+ request_body = TranscribeRequest,
+ responses(
+ (status = 200, description = "Audio transcribed successfully", body = TranscribeResponse),
+ (status = 400, description = "Invalid request (bad base64 or unsupported format)"),
+ (status = 401, description = "Invalid API key"),
+ (status = 412, description = "Dictation provider not configured"),
+ (status = 413, description = "Audio file too large (max 25MB)"),
+ (status = 429, description = "Rate limit exceeded"),
+ (status = 500, description = "Internal server error"),
+ (status = 502, description = "Dictation provider API error"),
+ (status = 503, description = "Service unavailable"),
+ (status = 504, description = "Request timeout")
+ )
+)]
+pub async fn transcribe_dictation(
+ Json(request): Json<TranscribeRequest>,
+) -> Result<Json<TranscribeResponse>, ErrorResponse> {
+ let (audio_bytes, extension) = validate_audio(&request.audio, &request.mime_type)?;
+ let provider_name = request.provider.as_str();
+ let (api_key, url) = get_provider_config(provider_name)?;
+
+ let text = match request.provider {
+ DictationProvider::OpenAI => {
+ transcribe_openai(audio_bytes, extension, &request.mime_type, &api_key, &url).await?
+ }
+ DictationProvider::ElevenLabs => {
+ transcribe_elevenlabs(audio_bytes, extension, &request.mime_type, &api_key, &url)
+ .await?
+ }
+ };
+
+ Ok(Json(TranscribeResponse { text }))
+}
+
+#[utoipa::path(
+ get,
+ path = "/dictation/config",
+ responses(
+ (status = 200, description = "Audio transcription provider configurations", body = HashMap<String, DictationProviderStatus>)
+ )
+)]
+pub async fn get_dictation_config(
+) -> Result<Json<HashMap<String, DictationProviderStatus>>, ErrorResponse> {
+ let config = goose::config::Config::global();
+ let mut providers = HashMap::new();
+
+ for (name, def) in PROVIDERS.iter() {
+ let configured = config.get_secret::<String>(def.config_key).is_ok();
+
+ let host = if let Some(host_key) = def.host_key {
+ config
+ .get(host_key, false)
+ .ok()
+ .and_then(|v| v.as_str().map(|s| s.to_string()))
+ } else {
+ None
+ };
+
+ providers.insert(
+ name.to_string(),
+ DictationProviderStatus {
+ configured,
+ host,
+ description: def.description.to_string(),
+ uses_provider_config: def.uses_provider_config,
+ settings_path: def.settings_path.map(|s| s.to_string()),
+ config_key: if !def.uses_provider_config {
+ Some(def.config_key.to_string())
+ } else {
+ None
+ },
+ },
+ );
+ }
+
+ Ok(Json(providers))
+}
+
+pub fn routes(state: Arc<AppState>) -> Router {
+ Router::new()
+ .route("/dictation/transcribe", post(transcribe_dictation))
+ .route("/dictation/config", get(get_dictation_config))
+ .with_state(state)
+}
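The unit tests deleted with audio.rs (auth, size, and MIME validation) have no counterpart in the new module. A minimal sketch of module-level tests for the shared validation path, assuming ErrorResponse exposes its status field as the struct literals above suggest and that ErrorResponse::bad_request maps to HTTP 400:

    #[cfg(test)]
    mod tests {
        use super::*;

        #[test]
        fn rejects_invalid_base64() {
            let err = validate_audio("not-base64-!@#$", "audio/webm").unwrap_err();
            // Assumes ErrorResponse::bad_request produces StatusCode::BAD_REQUEST.
            assert_eq!(err.status, StatusCode::BAD_REQUEST);
        }

        #[test]
        fn rejects_unsupported_mime_type() {
            // "dGVzdA==" is valid base64 for "test".
            let err = validate_audio("dGVzdA==", "application/pdf").unwrap_err();
            assert_eq!(err.status, StatusCode::UNSUPPORTED_MEDIA_TYPE);
        }

        #[test]
        fn maps_mime_type_to_extension() {
            let (bytes, ext) = validate_audio("dGVzdA==", "audio/mpeg").unwrap();
            assert_eq!(bytes, b"test");
            assert_eq!(ext, "mp3");
        }
    }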
diff --git a/crates/goose-server/src/routes/mod.rs b/crates/goose-server/src/routes/mod.rs
index d65241cc0d75..e0935c2476a8 100644
--- a/crates/goose-server/src/routes/mod.rs
+++ b/crates/goose-server/src/routes/mod.rs
@@ -1,7 +1,7 @@
pub mod action_required;
pub mod agent;
-pub mod audio;
pub mod config_management;
+pub mod dictation;
pub mod errors;
pub mod mcp_app_proxy;
pub mod mcp_ui_proxy;
@@ -28,7 +28,7 @@ pub fn configure(state: Arc<AppState>, secret_key: String) -> Router
.merge(reply::routes(state.clone()))
.merge(action_required::routes(state.clone()))
.merge(agent::routes(state.clone()))
- .merge(audio::routes(state.clone()))
+ .merge(dictation::routes(state.clone()))
.merge(config_management::routes(state.clone()))
.merge(prompts::routes())
.merge(recipe::routes(state.clone()))
diff --git a/ui/desktop/openapi.json b/ui/desktop/openapi.json
index 1b56e711b402..806cb04ba92c 100644
--- a/ui/desktop/openapi.json
+++ b/ui/desktop/openapi.json
@@ -1571,6 +1571,86 @@
}
}
},
+ "/dictation/config": {
+ "get": {
+ "tags": [
+ "super::routes::dictation"
+ ],
+ "operationId": "get_dictation_config",
+ "responses": {
+ "200": {
+ "description": "Audio transcription provider configurations",
+ "content": {
+ "application/json": {
+ "schema": {
+ "type": "object",
+ "additionalProperties": {
+ "$ref": "#/components/schemas/DictationProviderStatus"
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ },
+ "/dictation/transcribe": {
+ "post": {
+ "tags": [
+ "super::routes::dictation"
+ ],
+ "operationId": "transcribe_dictation",
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/TranscribeRequest"
+ }
+ }
+ },
+ "required": true
+ },
+ "responses": {
+ "200": {
+ "description": "Audio transcribed successfully",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/TranscribeResponse"
+ }
+ }
+ }
+ },
+ "400": {
+ "description": "Invalid request (bad base64 or unsupported format)"
+ },
+ "401": {
+ "description": "Invalid API key"
+ },
+ "412": {
+ "description": "DictationProvider not configured"
+ },
+ "413": {
+ "description": "Audio file too large (max 25MB)"
+ },
+ "429": {
+ "description": "Rate limit exceeded"
+ },
+ "500": {
+ "description": "Internal server error"
+ },
+ "502": {
+ "description": "DictationProvider API error"
+ },
+ "503": {
+ "description": "Service unavailable"
+ },
+ "504": {
+ "description": "Request timeout"
+ }
+ }
+ }
+ },
"/handle_openrouter": {
"post": {
"tags": [
@@ -3593,6 +3673,50 @@
}
}
},
+ "DictationProvider": {
+ "type": "string",
+ "enum": [
+ "openai",
+ "elevenlabs"
+ ]
+ },
+ "DictationProviderStatus": {
+ "type": "object",
+ "required": [
+ "configured",
+ "description",
+ "uses_provider_config"
+ ],
+ "properties": {
+ "config_key": {
+ "type": "string",
+ "description": "Config key name if uses_provider_config is false",
+ "nullable": true
+ },
+ "configured": {
+ "type": "boolean",
+ "description": "Whether the provider is fully configured and ready to use"
+ },
+ "description": {
+ "type": "string",
+ "description": "Description of what this provider does"
+ },
+ "host": {
+ "type": "string",
+ "description": "Custom host URL if configured (only for providers that support it)",
+ "nullable": true
+ },
+ "settings_path": {
+ "type": "string",
+ "description": "Path to settings if uses_provider_config is true",
+ "nullable": true
+ },
+ "uses_provider_config": {
+ "type": "boolean",
+ "description": "Whether this provider uses the main provider config (true) or has its own key (false)"
+ }
+ }
+ },
"EmbeddedResource": {
"type": "object",
"required": [
@@ -6612,6 +6736,39 @@
}
}
},
+ "TranscribeRequest": {
+ "type": "object",
+ "required": [
+ "audio",
+ "mime_type",
+ "provider"
+ ],
+ "properties": {
+ "audio": {
+ "type": "string",
+ "description": "Base64 encoded audio data"
+ },
+ "mime_type": {
+ "type": "string",
+ "description": "MIME type of the audio (e.g., \"audio/webm\", \"audio/wav\")"
+ },
+ "provider": {
+ "$ref": "#/components/schemas/DictationProvider"
+ }
+ }
+ },
+ "TranscribeResponse": {
+ "type": "object",
+ "required": [
+ "text"
+ ],
+ "properties": {
+ "text": {
+ "type": "string",
+ "description": "Transcribed text from the audio"
+ }
+ }
+ },
"TunnelInfo": {
"type": "object",
"required": [
diff --git a/ui/desktop/src/api/index.ts b/ui/desktop/src/api/index.ts
index 4ff4623ae0a1..0b6cd8f14735 100644
--- a/ui/desktop/src/api/index.ts
+++ b/ui/desktop/src/api/index.ts
@@ -1,4 +1,4 @@
// This file is auto-generated by @hey-api/openapi-ts
-export { addExtension, agentAddExtension, agentRemoveExtension, backupConfig, callTool, checkProvider, configureProviderOauth, confirmToolAction, createCustomProvider, createRecipe, createSchedule, decodeRecipe, deleteRecipe, deleteSchedule, deleteSession, detectProvider, diagnostics, encodeRecipe, exportApp, exportSession, forkSession, getCustomProvider, getExtensions, getPricing, getPrompt, getPrompts, getProviderModels, getSession, getSessionExtensions, getSessionInsights, getSlashCommands, getTools, getTunnelStatus, importApp, importSession, initConfig, inspectRunningJob, killRunningJob, listApps, listRecipes, listSchedules, listSessions, mcpUiProxy, type Options, parseRecipe, pauseSchedule, providers, readAllConfig, readConfig, readResource, recipeToYaml, recoverConfig, removeConfig, removeCustomProvider, removeExtension, reply, resetPrompt, restartAgent, resumeAgent, runNowHandler, savePrompt, saveRecipe, scanRecipe, scheduleRecipe, sendTelemetryEvent, sessionsHandler, setConfigProvider, setRecipeSlashCommand, startAgent, startOpenrouterSetup, startTetrateSetup, startTunnel, status, stopAgent, stopTunnel, systemInfo, unpauseSchedule, updateAgentProvider, updateCustomProvider, updateFromSession, updateSchedule, updateSessionName, updateSessionUserRecipeValues, updateWorkingDir, upsertConfig, upsertPermissions, validateConfig } from './sdk.gen';
-export type { ActionRequired, ActionRequiredData, AddExtensionData, AddExtensionErrors, AddExtensionRequest, AddExtensionResponse, AddExtensionResponses, AgentAddExtensionData, AgentAddExtensionErrors, AgentAddExtensionResponse, AgentAddExtensionResponses, AgentRemoveExtensionData, AgentRemoveExtensionErrors, AgentRemoveExtensionResponse, AgentRemoveExtensionResponses, Annotations, Author, AuthorRequest, BackupConfigData, BackupConfigErrors, BackupConfigResponse, BackupConfigResponses, CallToolData, CallToolErrors, CallToolRequest, CallToolResponse, CallToolResponse2, CallToolResponses, ChatRequest, CheckProviderData, CheckProviderRequest, ClientOptions, CommandType, ConfigKey, ConfigKeyQuery, ConfigResponse, ConfigureProviderOauthData, ConfigureProviderOauthErrors, ConfigureProviderOauthResponses, ConfirmToolActionData, ConfirmToolActionErrors, ConfirmToolActionRequest, ConfirmToolActionResponses, Content, Conversation, CreateCustomProviderData, CreateCustomProviderErrors, CreateCustomProviderResponse, CreateCustomProviderResponses, CreateRecipeData, CreateRecipeErrors, CreateRecipeRequest, CreateRecipeResponse, CreateRecipeResponse2, CreateRecipeResponses, CreateScheduleData, CreateScheduleErrors, CreateScheduleRequest, CreateScheduleResponse, CreateScheduleResponses, CspMetadata, DeclarativeProviderConfig, DecodeRecipeData, DecodeRecipeErrors, DecodeRecipeRequest, DecodeRecipeResponse, DecodeRecipeResponse2, DecodeRecipeResponses, DeleteRecipeData, DeleteRecipeErrors, DeleteRecipeRequest, DeleteRecipeResponse, DeleteRecipeResponses, DeleteScheduleData, DeleteScheduleErrors, DeleteScheduleResponse, DeleteScheduleResponses, DeleteSessionData, DeleteSessionErrors, DeleteSessionResponses, DetectProviderData, DetectProviderErrors, DetectProviderRequest, DetectProviderResponse, DetectProviderResponse2, DetectProviderResponses, DiagnosticsData, DiagnosticsErrors, DiagnosticsResponse, DiagnosticsResponses, EmbeddedResource, EncodeRecipeData, EncodeRecipeErrors, EncodeRecipeRequest, EncodeRecipeResponse, EncodeRecipeResponse2, EncodeRecipeResponses, Envs, ErrorResponse, ExportAppData, ExportAppError, ExportAppErrors, ExportAppResponse, ExportAppResponses, ExportSessionData, ExportSessionErrors, ExportSessionResponse, ExportSessionResponses, ExtensionConfig, ExtensionData, ExtensionEntry, ExtensionLoadResult, ExtensionQuery, ExtensionResponse, ForkRequest, ForkResponse, ForkSessionData, ForkSessionErrors, ForkSessionResponse, ForkSessionResponses, FrontendToolRequest, GetCustomProviderData, GetCustomProviderErrors, GetCustomProviderResponse, GetCustomProviderResponses, GetExtensionsData, GetExtensionsErrors, GetExtensionsResponse, GetExtensionsResponses, GetPricingData, GetPricingResponse, GetPricingResponses, GetPromptData, GetPromptErrors, GetPromptResponse, GetPromptResponses, GetPromptsData, GetPromptsResponse, GetPromptsResponses, GetProviderModelsData, GetProviderModelsErrors, GetProviderModelsResponse, GetProviderModelsResponses, GetSessionData, GetSessionErrors, GetSessionExtensionsData, GetSessionExtensionsErrors, GetSessionExtensionsResponse, GetSessionExtensionsResponses, GetSessionInsightsData, GetSessionInsightsErrors, GetSessionInsightsResponse, GetSessionInsightsResponses, GetSessionResponse, GetSessionResponses, GetSlashCommandsData, GetSlashCommandsResponse, GetSlashCommandsResponses, GetToolsData, GetToolsErrors, GetToolsQuery, GetToolsResponse, GetToolsResponses, GetTunnelStatusData, GetTunnelStatusResponse, GetTunnelStatusResponses, GooseApp, Icon, ImageContent, 
ImportAppData, ImportAppError, ImportAppErrors, ImportAppRequest, ImportAppResponse, ImportAppResponse2, ImportAppResponses, ImportSessionData, ImportSessionErrors, ImportSessionRequest, ImportSessionResponse, ImportSessionResponses, InitConfigData, InitConfigErrors, InitConfigResponse, InitConfigResponses, InspectJobResponse, InspectRunningJobData, InspectRunningJobErrors, InspectRunningJobResponse, InspectRunningJobResponses, JsonObject, KillJobResponse, KillRunningJobData, KillRunningJobResponses, ListAppsData, ListAppsError, ListAppsErrors, ListAppsRequest, ListAppsResponse, ListAppsResponse2, ListAppsResponses, ListRecipeResponse, ListRecipesData, ListRecipesErrors, ListRecipesResponse, ListRecipesResponses, ListSchedulesData, ListSchedulesErrors, ListSchedulesResponse, ListSchedulesResponse2, ListSchedulesResponses, ListSessionsData, ListSessionsErrors, ListSessionsResponse, ListSessionsResponses, LoadedProvider, McpAppResource, McpUiProxyData, McpUiProxyErrors, McpUiProxyResponses, Message, MessageContent, MessageEvent, MessageMetadata, ModelConfig, ModelInfo, ParseRecipeData, ParseRecipeError, ParseRecipeErrors, ParseRecipeRequest, ParseRecipeResponse, ParseRecipeResponse2, ParseRecipeResponses, PauseScheduleData, PauseScheduleErrors, PauseScheduleResponse, PauseScheduleResponses, PermissionLevel, PricingData, PricingQuery, PricingResponse, PrincipalType, PromptContentResponse, PromptsListResponse, ProviderDetails, ProviderEngine, ProviderMetadata, ProvidersData, ProvidersResponse, ProvidersResponse2, ProvidersResponses, ProviderType, RawAudioContent, RawEmbeddedResource, RawImageContent, RawResource, RawTextContent, ReadAllConfigData, ReadAllConfigResponse, ReadAllConfigResponses, ReadConfigData, ReadConfigErrors, ReadConfigResponses, ReadResourceData, ReadResourceErrors, ReadResourceRequest, ReadResourceResponse, ReadResourceResponse2, ReadResourceResponses, Recipe, RecipeManifest, RecipeParameter, RecipeParameterInputType, RecipeParameterRequirement, RecipeToYamlData, RecipeToYamlError, RecipeToYamlErrors, RecipeToYamlRequest, RecipeToYamlResponse, RecipeToYamlResponse2, RecipeToYamlResponses, RecoverConfigData, RecoverConfigErrors, RecoverConfigResponse, RecoverConfigResponses, RedactedThinkingContent, RemoveConfigData, RemoveConfigErrors, RemoveConfigResponse, RemoveConfigResponses, RemoveCustomProviderData, RemoveCustomProviderErrors, RemoveCustomProviderResponse, RemoveCustomProviderResponses, RemoveExtensionData, RemoveExtensionErrors, RemoveExtensionRequest, RemoveExtensionResponse, RemoveExtensionResponses, ReplyData, ReplyErrors, ReplyResponse, ReplyResponses, ResetPromptData, ResetPromptErrors, ResetPromptResponse, ResetPromptResponses, ResourceContents, ResourceMetadata, Response, RestartAgentData, RestartAgentErrors, RestartAgentRequest, RestartAgentResponse, RestartAgentResponse2, RestartAgentResponses, ResumeAgentData, ResumeAgentErrors, ResumeAgentRequest, ResumeAgentResponse, ResumeAgentResponse2, ResumeAgentResponses, RetryConfig, Role, RunNowHandlerData, RunNowHandlerErrors, RunNowHandlerResponse, RunNowHandlerResponses, RunNowResponse, SavePromptData, SavePromptErrors, SavePromptRequest, SavePromptResponse, SavePromptResponses, SaveRecipeData, SaveRecipeError, SaveRecipeErrors, SaveRecipeRequest, SaveRecipeResponse, SaveRecipeResponse2, SaveRecipeResponses, ScanRecipeData, ScanRecipeRequest, ScanRecipeResponse, ScanRecipeResponse2, ScanRecipeResponses, ScheduledJob, ScheduleRecipeData, ScheduleRecipeErrors, ScheduleRecipeRequest, ScheduleRecipeResponses, 
SendTelemetryEventData, SendTelemetryEventResponses, Session, SessionDisplayInfo, SessionExtensionsResponse, SessionInsights, SessionListResponse, SessionsHandlerData, SessionsHandlerErrors, SessionsHandlerResponse, SessionsHandlerResponses, SessionsQuery, SessionType, SetConfigProviderData, SetProviderRequest, SetRecipeSlashCommandData, SetRecipeSlashCommandErrors, SetRecipeSlashCommandResponses, SetSlashCommandRequest, Settings, SetupResponse, SlashCommand, SlashCommandsResponse, StartAgentData, StartAgentError, StartAgentErrors, StartAgentRequest, StartAgentResponse, StartAgentResponses, StartOpenrouterSetupData, StartOpenrouterSetupResponse, StartOpenrouterSetupResponses, StartTetrateSetupData, StartTetrateSetupResponse, StartTetrateSetupResponses, StartTunnelData, StartTunnelError, StartTunnelErrors, StartTunnelResponse, StartTunnelResponses, StatusData, StatusResponse, StatusResponses, StopAgentData, StopAgentErrors, StopAgentRequest, StopAgentResponse, StopAgentResponses, StopTunnelData, StopTunnelError, StopTunnelErrors, StopTunnelResponses, SubRecipe, SuccessCheck, SystemInfo, SystemInfoData, SystemInfoResponse, SystemInfoResponses, SystemNotificationContent, SystemNotificationType, TelemetryEventRequest, Template, TextContent, ThinkingContent, TokenState, Tool, ToolAnnotations, ToolConfirmationRequest, ToolInfo, ToolPermission, ToolRequest, ToolResponse, TunnelInfo, TunnelState, UiMetadata, UnpauseScheduleData, UnpauseScheduleErrors, UnpauseScheduleResponse, UnpauseScheduleResponses, UpdateAgentProviderData, UpdateAgentProviderErrors, UpdateAgentProviderResponses, UpdateCustomProviderData, UpdateCustomProviderErrors, UpdateCustomProviderRequest, UpdateCustomProviderResponse, UpdateCustomProviderResponses, UpdateFromSessionData, UpdateFromSessionErrors, UpdateFromSessionRequest, UpdateFromSessionResponses, UpdateProviderRequest, UpdateScheduleData, UpdateScheduleErrors, UpdateScheduleRequest, UpdateScheduleResponse, UpdateScheduleResponses, UpdateSessionNameData, UpdateSessionNameErrors, UpdateSessionNameRequest, UpdateSessionNameResponses, UpdateSessionUserRecipeValuesData, UpdateSessionUserRecipeValuesError, UpdateSessionUserRecipeValuesErrors, UpdateSessionUserRecipeValuesRequest, UpdateSessionUserRecipeValuesResponse, UpdateSessionUserRecipeValuesResponse2, UpdateSessionUserRecipeValuesResponses, UpdateWorkingDirData, UpdateWorkingDirErrors, UpdateWorkingDirRequest, UpdateWorkingDirResponses, UpsertConfigData, UpsertConfigErrors, UpsertConfigQuery, UpsertConfigResponse, UpsertConfigResponses, UpsertPermissionsData, UpsertPermissionsErrors, UpsertPermissionsQuery, UpsertPermissionsResponse, UpsertPermissionsResponses, ValidateConfigData, ValidateConfigErrors, ValidateConfigResponse, ValidateConfigResponses, WindowProps } from './types.gen';
+export { addExtension, agentAddExtension, agentRemoveExtension, backupConfig, callTool, checkProvider, configureProviderOauth, confirmToolAction, createCustomProvider, createRecipe, createSchedule, decodeRecipe, deleteRecipe, deleteSchedule, deleteSession, detectProvider, diagnostics, encodeRecipe, exportApp, exportSession, forkSession, getCustomProvider, getDictationConfig, getExtensions, getPricing, getPrompt, getPrompts, getProviderModels, getSession, getSessionExtensions, getSessionInsights, getSlashCommands, getTools, getTunnelStatus, importApp, importSession, initConfig, inspectRunningJob, killRunningJob, listApps, listRecipes, listSchedules, listSessions, mcpUiProxy, type Options, parseRecipe, pauseSchedule, providers, readAllConfig, readConfig, readResource, recipeToYaml, recoverConfig, removeConfig, removeCustomProvider, removeExtension, reply, resetPrompt, restartAgent, resumeAgent, runNowHandler, savePrompt, saveRecipe, scanRecipe, scheduleRecipe, sendTelemetryEvent, sessionsHandler, setConfigProvider, setRecipeSlashCommand, startAgent, startOpenrouterSetup, startTetrateSetup, startTunnel, status, stopAgent, stopTunnel, systemInfo, transcribeDictation, unpauseSchedule, updateAgentProvider, updateCustomProvider, updateFromSession, updateSchedule, updateSessionName, updateSessionUserRecipeValues, updateWorkingDir, upsertConfig, upsertPermissions, validateConfig } from './sdk.gen';
+export type { ActionRequired, ActionRequiredData, AddExtensionData, AddExtensionErrors, AddExtensionRequest, AddExtensionResponse, AddExtensionResponses, AgentAddExtensionData, AgentAddExtensionErrors, AgentAddExtensionResponse, AgentAddExtensionResponses, AgentRemoveExtensionData, AgentRemoveExtensionErrors, AgentRemoveExtensionResponse, AgentRemoveExtensionResponses, Annotations, Author, AuthorRequest, BackupConfigData, BackupConfigErrors, BackupConfigResponse, BackupConfigResponses, CallToolData, CallToolErrors, CallToolRequest, CallToolResponse, CallToolResponse2, CallToolResponses, ChatRequest, CheckProviderData, CheckProviderRequest, ClientOptions, CommandType, ConfigKey, ConfigKeyQuery, ConfigResponse, ConfigureProviderOauthData, ConfigureProviderOauthErrors, ConfigureProviderOauthResponses, ConfirmToolActionData, ConfirmToolActionErrors, ConfirmToolActionRequest, ConfirmToolActionResponses, Content, Conversation, CreateCustomProviderData, CreateCustomProviderErrors, CreateCustomProviderResponse, CreateCustomProviderResponses, CreateRecipeData, CreateRecipeErrors, CreateRecipeRequest, CreateRecipeResponse, CreateRecipeResponse2, CreateRecipeResponses, CreateScheduleData, CreateScheduleErrors, CreateScheduleRequest, CreateScheduleResponse, CreateScheduleResponses, CspMetadata, DeclarativeProviderConfig, DecodeRecipeData, DecodeRecipeErrors, DecodeRecipeRequest, DecodeRecipeResponse, DecodeRecipeResponse2, DecodeRecipeResponses, DeleteRecipeData, DeleteRecipeErrors, DeleteRecipeRequest, DeleteRecipeResponse, DeleteRecipeResponses, DeleteScheduleData, DeleteScheduleErrors, DeleteScheduleResponse, DeleteScheduleResponses, DeleteSessionData, DeleteSessionErrors, DeleteSessionResponses, DetectProviderData, DetectProviderErrors, DetectProviderRequest, DetectProviderResponse, DetectProviderResponse2, DetectProviderResponses, DiagnosticsData, DiagnosticsErrors, DiagnosticsResponse, DiagnosticsResponses, DictationProvider, DictationProviderStatus, EmbeddedResource, EncodeRecipeData, EncodeRecipeErrors, EncodeRecipeRequest, EncodeRecipeResponse, EncodeRecipeResponse2, EncodeRecipeResponses, Envs, ErrorResponse, ExportAppData, ExportAppError, ExportAppErrors, ExportAppResponse, ExportAppResponses, ExportSessionData, ExportSessionErrors, ExportSessionResponse, ExportSessionResponses, ExtensionConfig, ExtensionData, ExtensionEntry, ExtensionLoadResult, ExtensionQuery, ExtensionResponse, ForkRequest, ForkResponse, ForkSessionData, ForkSessionErrors, ForkSessionResponse, ForkSessionResponses, FrontendToolRequest, GetCustomProviderData, GetCustomProviderErrors, GetCustomProviderResponse, GetCustomProviderResponses, GetDictationConfigData, GetDictationConfigResponse, GetDictationConfigResponses, GetExtensionsData, GetExtensionsErrors, GetExtensionsResponse, GetExtensionsResponses, GetPricingData, GetPricingResponse, GetPricingResponses, GetPromptData, GetPromptErrors, GetPromptResponse, GetPromptResponses, GetPromptsData, GetPromptsResponse, GetPromptsResponses, GetProviderModelsData, GetProviderModelsErrors, GetProviderModelsResponse, GetProviderModelsResponses, GetSessionData, GetSessionErrors, GetSessionExtensionsData, GetSessionExtensionsErrors, GetSessionExtensionsResponse, GetSessionExtensionsResponses, GetSessionInsightsData, GetSessionInsightsErrors, GetSessionInsightsResponse, GetSessionInsightsResponses, GetSessionResponse, GetSessionResponses, GetSlashCommandsData, GetSlashCommandsResponse, GetSlashCommandsResponses, GetToolsData, GetToolsErrors, GetToolsQuery, GetToolsResponse, 
GetToolsResponses, GetTunnelStatusData, GetTunnelStatusResponse, GetTunnelStatusResponses, GooseApp, Icon, ImageContent, ImportAppData, ImportAppError, ImportAppErrors, ImportAppRequest, ImportAppResponse, ImportAppResponse2, ImportAppResponses, ImportSessionData, ImportSessionErrors, ImportSessionRequest, ImportSessionResponse, ImportSessionResponses, InitConfigData, InitConfigErrors, InitConfigResponse, InitConfigResponses, InspectJobResponse, InspectRunningJobData, InspectRunningJobErrors, InspectRunningJobResponse, InspectRunningJobResponses, JsonObject, KillJobResponse, KillRunningJobData, KillRunningJobResponses, ListAppsData, ListAppsError, ListAppsErrors, ListAppsRequest, ListAppsResponse, ListAppsResponse2, ListAppsResponses, ListRecipeResponse, ListRecipesData, ListRecipesErrors, ListRecipesResponse, ListRecipesResponses, ListSchedulesData, ListSchedulesErrors, ListSchedulesResponse, ListSchedulesResponse2, ListSchedulesResponses, ListSessionsData, ListSessionsErrors, ListSessionsResponse, ListSessionsResponses, LoadedProvider, McpAppResource, McpUiProxyData, McpUiProxyErrors, McpUiProxyResponses, Message, MessageContent, MessageEvent, MessageMetadata, ModelConfig, ModelInfo, ParseRecipeData, ParseRecipeError, ParseRecipeErrors, ParseRecipeRequest, ParseRecipeResponse, ParseRecipeResponse2, ParseRecipeResponses, PauseScheduleData, PauseScheduleErrors, PauseScheduleResponse, PauseScheduleResponses, PermissionLevel, PricingData, PricingQuery, PricingResponse, PrincipalType, PromptContentResponse, PromptsListResponse, ProviderDetails, ProviderEngine, ProviderMetadata, ProvidersData, ProvidersResponse, ProvidersResponse2, ProvidersResponses, ProviderType, RawAudioContent, RawEmbeddedResource, RawImageContent, RawResource, RawTextContent, ReadAllConfigData, ReadAllConfigResponse, ReadAllConfigResponses, ReadConfigData, ReadConfigErrors, ReadConfigResponses, ReadResourceData, ReadResourceErrors, ReadResourceRequest, ReadResourceResponse, ReadResourceResponse2, ReadResourceResponses, Recipe, RecipeManifest, RecipeParameter, RecipeParameterInputType, RecipeParameterRequirement, RecipeToYamlData, RecipeToYamlError, RecipeToYamlErrors, RecipeToYamlRequest, RecipeToYamlResponse, RecipeToYamlResponse2, RecipeToYamlResponses, RecoverConfigData, RecoverConfigErrors, RecoverConfigResponse, RecoverConfigResponses, RedactedThinkingContent, RemoveConfigData, RemoveConfigErrors, RemoveConfigResponse, RemoveConfigResponses, RemoveCustomProviderData, RemoveCustomProviderErrors, RemoveCustomProviderResponse, RemoveCustomProviderResponses, RemoveExtensionData, RemoveExtensionErrors, RemoveExtensionRequest, RemoveExtensionResponse, RemoveExtensionResponses, ReplyData, ReplyErrors, ReplyResponse, ReplyResponses, ResetPromptData, ResetPromptErrors, ResetPromptResponse, ResetPromptResponses, ResourceContents, ResourceMetadata, Response, RestartAgentData, RestartAgentErrors, RestartAgentRequest, RestartAgentResponse, RestartAgentResponse2, RestartAgentResponses, ResumeAgentData, ResumeAgentErrors, ResumeAgentRequest, ResumeAgentResponse, ResumeAgentResponse2, ResumeAgentResponses, RetryConfig, Role, RunNowHandlerData, RunNowHandlerErrors, RunNowHandlerResponse, RunNowHandlerResponses, RunNowResponse, SavePromptData, SavePromptErrors, SavePromptRequest, SavePromptResponse, SavePromptResponses, SaveRecipeData, SaveRecipeError, SaveRecipeErrors, SaveRecipeRequest, SaveRecipeResponse, SaveRecipeResponse2, SaveRecipeResponses, ScanRecipeData, ScanRecipeRequest, ScanRecipeResponse, ScanRecipeResponse2, 
ScanRecipeResponses, ScheduledJob, ScheduleRecipeData, ScheduleRecipeErrors, ScheduleRecipeRequest, ScheduleRecipeResponses, SendTelemetryEventData, SendTelemetryEventResponses, Session, SessionDisplayInfo, SessionExtensionsResponse, SessionInsights, SessionListResponse, SessionsHandlerData, SessionsHandlerErrors, SessionsHandlerResponse, SessionsHandlerResponses, SessionsQuery, SessionType, SetConfigProviderData, SetProviderRequest, SetRecipeSlashCommandData, SetRecipeSlashCommandErrors, SetRecipeSlashCommandResponses, SetSlashCommandRequest, Settings, SetupResponse, SlashCommand, SlashCommandsResponse, StartAgentData, StartAgentError, StartAgentErrors, StartAgentRequest, StartAgentResponse, StartAgentResponses, StartOpenrouterSetupData, StartOpenrouterSetupResponse, StartOpenrouterSetupResponses, StartTetrateSetupData, StartTetrateSetupResponse, StartTetrateSetupResponses, StartTunnelData, StartTunnelError, StartTunnelErrors, StartTunnelResponse, StartTunnelResponses, StatusData, StatusResponse, StatusResponses, StopAgentData, StopAgentErrors, StopAgentRequest, StopAgentResponse, StopAgentResponses, StopTunnelData, StopTunnelError, StopTunnelErrors, StopTunnelResponses, SubRecipe, SuccessCheck, SystemInfo, SystemInfoData, SystemInfoResponse, SystemInfoResponses, SystemNotificationContent, SystemNotificationType, TelemetryEventRequest, Template, TextContent, ThinkingContent, TokenState, Tool, ToolAnnotations, ToolConfirmationRequest, ToolInfo, ToolPermission, ToolRequest, ToolResponse, TranscribeDictationData, TranscribeDictationErrors, TranscribeDictationResponse, TranscribeDictationResponses, TranscribeRequest, TranscribeResponse, TunnelInfo, TunnelState, UiMetadata, UnpauseScheduleData, UnpauseScheduleErrors, UnpauseScheduleResponse, UnpauseScheduleResponses, UpdateAgentProviderData, UpdateAgentProviderErrors, UpdateAgentProviderResponses, UpdateCustomProviderData, UpdateCustomProviderErrors, UpdateCustomProviderRequest, UpdateCustomProviderResponse, UpdateCustomProviderResponses, UpdateFromSessionData, UpdateFromSessionErrors, UpdateFromSessionRequest, UpdateFromSessionResponses, UpdateProviderRequest, UpdateScheduleData, UpdateScheduleErrors, UpdateScheduleRequest, UpdateScheduleResponse, UpdateScheduleResponses, UpdateSessionNameData, UpdateSessionNameErrors, UpdateSessionNameRequest, UpdateSessionNameResponses, UpdateSessionUserRecipeValuesData, UpdateSessionUserRecipeValuesError, UpdateSessionUserRecipeValuesErrors, UpdateSessionUserRecipeValuesRequest, UpdateSessionUserRecipeValuesResponse, UpdateSessionUserRecipeValuesResponse2, UpdateSessionUserRecipeValuesResponses, UpdateWorkingDirData, UpdateWorkingDirErrors, UpdateWorkingDirRequest, UpdateWorkingDirResponses, UpsertConfigData, UpsertConfigErrors, UpsertConfigQuery, UpsertConfigResponse, UpsertConfigResponses, UpsertPermissionsData, UpsertPermissionsErrors, UpsertPermissionsQuery, UpsertPermissionsResponse, UpsertPermissionsResponses, ValidateConfigData, ValidateConfigErrors, ValidateConfigResponse, ValidateConfigResponses, WindowProps } from './types.gen';
diff --git a/ui/desktop/src/api/sdk.gen.ts b/ui/desktop/src/api/sdk.gen.ts
index 9ddecc308241..53af9618e058 100644
--- a/ui/desktop/src/api/sdk.gen.ts
+++ b/ui/desktop/src/api/sdk.gen.ts
@@ -2,7 +2,7 @@
import type { Client, Options as Options2, TDataShape } from './client';
import { client } from './client.gen';
-import type { AddExtensionData, AddExtensionErrors, AddExtensionResponses, AgentAddExtensionData, AgentAddExtensionErrors, AgentAddExtensionResponses, AgentRemoveExtensionData, AgentRemoveExtensionErrors, AgentRemoveExtensionResponses, BackupConfigData, BackupConfigErrors, BackupConfigResponses, CallToolData, CallToolErrors, CallToolResponses, CheckProviderData, ConfigureProviderOauthData, ConfigureProviderOauthErrors, ConfigureProviderOauthResponses, ConfirmToolActionData, ConfirmToolActionErrors, ConfirmToolActionResponses, CreateCustomProviderData, CreateCustomProviderErrors, CreateCustomProviderResponses, CreateRecipeData, CreateRecipeErrors, CreateRecipeResponses, CreateScheduleData, CreateScheduleErrors, CreateScheduleResponses, DecodeRecipeData, DecodeRecipeErrors, DecodeRecipeResponses, DeleteRecipeData, DeleteRecipeErrors, DeleteRecipeResponses, DeleteScheduleData, DeleteScheduleErrors, DeleteScheduleResponses, DeleteSessionData, DeleteSessionErrors, DeleteSessionResponses, DetectProviderData, DetectProviderErrors, DetectProviderResponses, DiagnosticsData, DiagnosticsErrors, DiagnosticsResponses, EncodeRecipeData, EncodeRecipeErrors, EncodeRecipeResponses, ExportAppData, ExportAppErrors, ExportAppResponses, ExportSessionData, ExportSessionErrors, ExportSessionResponses, ForkSessionData, ForkSessionErrors, ForkSessionResponses, GetCustomProviderData, GetCustomProviderErrors, GetCustomProviderResponses, GetExtensionsData, GetExtensionsErrors, GetExtensionsResponses, GetPricingData, GetPricingResponses, GetPromptData, GetPromptErrors, GetPromptResponses, GetPromptsData, GetPromptsResponses, GetProviderModelsData, GetProviderModelsErrors, GetProviderModelsResponses, GetSessionData, GetSessionErrors, GetSessionExtensionsData, GetSessionExtensionsErrors, GetSessionExtensionsResponses, GetSessionInsightsData, GetSessionInsightsErrors, GetSessionInsightsResponses, GetSessionResponses, GetSlashCommandsData, GetSlashCommandsResponses, GetToolsData, GetToolsErrors, GetToolsResponses, GetTunnelStatusData, GetTunnelStatusResponses, ImportAppData, ImportAppErrors, ImportAppResponses, ImportSessionData, ImportSessionErrors, ImportSessionResponses, InitConfigData, InitConfigErrors, InitConfigResponses, InspectRunningJobData, InspectRunningJobErrors, InspectRunningJobResponses, KillRunningJobData, KillRunningJobResponses, ListAppsData, ListAppsErrors, ListAppsResponses, ListRecipesData, ListRecipesErrors, ListRecipesResponses, ListSchedulesData, ListSchedulesErrors, ListSchedulesResponses, ListSessionsData, ListSessionsErrors, ListSessionsResponses, McpUiProxyData, McpUiProxyErrors, McpUiProxyResponses, ParseRecipeData, ParseRecipeErrors, ParseRecipeResponses, PauseScheduleData, PauseScheduleErrors, PauseScheduleResponses, ProvidersData, ProvidersResponses, ReadAllConfigData, ReadAllConfigResponses, ReadConfigData, ReadConfigErrors, ReadConfigResponses, ReadResourceData, ReadResourceErrors, ReadResourceResponses, RecipeToYamlData, RecipeToYamlErrors, RecipeToYamlResponses, RecoverConfigData, RecoverConfigErrors, RecoverConfigResponses, RemoveConfigData, RemoveConfigErrors, RemoveConfigResponses, RemoveCustomProviderData, RemoveCustomProviderErrors, RemoveCustomProviderResponses, RemoveExtensionData, RemoveExtensionErrors, RemoveExtensionResponses, ReplyData, ReplyErrors, ReplyResponses, ResetPromptData, ResetPromptErrors, ResetPromptResponses, RestartAgentData, RestartAgentErrors, RestartAgentResponses, ResumeAgentData, ResumeAgentErrors, ResumeAgentResponses, RunNowHandlerData, RunNowHandlerErrors, RunNowHandlerResponses, SavePromptData, SavePromptErrors, SavePromptResponses, SaveRecipeData, SaveRecipeErrors, SaveRecipeResponses, ScanRecipeData, ScanRecipeResponses, ScheduleRecipeData, ScheduleRecipeErrors, ScheduleRecipeResponses, SendTelemetryEventData, SendTelemetryEventResponses, SessionsHandlerData, SessionsHandlerErrors, SessionsHandlerResponses, SetConfigProviderData, SetRecipeSlashCommandData, SetRecipeSlashCommandErrors, SetRecipeSlashCommandResponses, StartAgentData, StartAgentErrors, StartAgentResponses, StartOpenrouterSetupData, StartOpenrouterSetupResponses, StartTetrateSetupData, StartTetrateSetupResponses, StartTunnelData, StartTunnelErrors, StartTunnelResponses, StatusData, StatusResponses, StopAgentData, StopAgentErrors, StopAgentResponses, StopTunnelData, StopTunnelErrors, StopTunnelResponses, SystemInfoData, SystemInfoResponses, UnpauseScheduleData, UnpauseScheduleErrors, UnpauseScheduleResponses, UpdateAgentProviderData, UpdateAgentProviderErrors, UpdateAgentProviderResponses, UpdateCustomProviderData, UpdateCustomProviderErrors, UpdateCustomProviderResponses, UpdateFromSessionData, UpdateFromSessionErrors, UpdateFromSessionResponses, UpdateScheduleData, UpdateScheduleErrors, UpdateScheduleResponses, UpdateSessionNameData, UpdateSessionNameErrors, UpdateSessionNameResponses, UpdateSessionUserRecipeValuesData, UpdateSessionUserRecipeValuesErrors, UpdateSessionUserRecipeValuesResponses, UpdateWorkingDirData, UpdateWorkingDirErrors, UpdateWorkingDirResponses, UpsertConfigData, UpsertConfigErrors, UpsertConfigResponses, UpsertPermissionsData, UpsertPermissionsErrors, UpsertPermissionsResponses, ValidateConfigData, ValidateConfigErrors, ValidateConfigResponses } from './types.gen';
+import type { AddExtensionData, AddExtensionErrors, AddExtensionResponses, AgentAddExtensionData, AgentAddExtensionErrors, AgentAddExtensionResponses, AgentRemoveExtensionData, AgentRemoveExtensionErrors, AgentRemoveExtensionResponses, BackupConfigData, BackupConfigErrors, BackupConfigResponses, CallToolData, CallToolErrors, CallToolResponses, CheckProviderData, ConfigureProviderOauthData, ConfigureProviderOauthErrors, ConfigureProviderOauthResponses, ConfirmToolActionData, ConfirmToolActionErrors, ConfirmToolActionResponses, CreateCustomProviderData, CreateCustomProviderErrors, CreateCustomProviderResponses, CreateRecipeData, CreateRecipeErrors, CreateRecipeResponses, CreateScheduleData, CreateScheduleErrors, CreateScheduleResponses, DecodeRecipeData, DecodeRecipeErrors, DecodeRecipeResponses, DeleteRecipeData, DeleteRecipeErrors, DeleteRecipeResponses, DeleteScheduleData, DeleteScheduleErrors, DeleteScheduleResponses, DeleteSessionData, DeleteSessionErrors, DeleteSessionResponses, DetectProviderData, DetectProviderErrors, DetectProviderResponses, DiagnosticsData, DiagnosticsErrors, DiagnosticsResponses, EncodeRecipeData, EncodeRecipeErrors, EncodeRecipeResponses, ExportAppData, ExportAppErrors, ExportAppResponses, ExportSessionData, ExportSessionErrors, ExportSessionResponses, ForkSessionData, ForkSessionErrors, ForkSessionResponses, GetCustomProviderData, GetCustomProviderErrors, GetCustomProviderResponses, GetDictationConfigData, GetDictationConfigResponses, GetExtensionsData, GetExtensionsErrors, GetExtensionsResponses, GetPricingData, GetPricingResponses, GetPromptData, GetPromptErrors, GetPromptResponses, GetPromptsData, GetPromptsResponses, GetProviderModelsData, GetProviderModelsErrors, GetProviderModelsResponses, GetSessionData, GetSessionErrors, GetSessionExtensionsData, GetSessionExtensionsErrors, GetSessionExtensionsResponses, GetSessionInsightsData, GetSessionInsightsErrors, GetSessionInsightsResponses, GetSessionResponses, GetSlashCommandsData, GetSlashCommandsResponses, GetToolsData, GetToolsErrors, GetToolsResponses, GetTunnelStatusData, GetTunnelStatusResponses, ImportAppData, ImportAppErrors, ImportAppResponses, ImportSessionData, ImportSessionErrors, ImportSessionResponses, InitConfigData, InitConfigErrors, InitConfigResponses, InspectRunningJobData, InspectRunningJobErrors, InspectRunningJobResponses, KillRunningJobData, KillRunningJobResponses, ListAppsData, ListAppsErrors, ListAppsResponses, ListRecipesData, ListRecipesErrors, ListRecipesResponses, ListSchedulesData, ListSchedulesErrors, ListSchedulesResponses, ListSessionsData, ListSessionsErrors, ListSessionsResponses, McpUiProxyData, McpUiProxyErrors, McpUiProxyResponses, ParseRecipeData, ParseRecipeErrors, ParseRecipeResponses, PauseScheduleData, PauseScheduleErrors, PauseScheduleResponses, ProvidersData, ProvidersResponses, ReadAllConfigData, ReadAllConfigResponses, ReadConfigData, ReadConfigErrors, ReadConfigResponses, ReadResourceData, ReadResourceErrors, ReadResourceResponses, RecipeToYamlData, RecipeToYamlErrors, RecipeToYamlResponses, RecoverConfigData, RecoverConfigErrors, RecoverConfigResponses, RemoveConfigData, RemoveConfigErrors, RemoveConfigResponses, RemoveCustomProviderData, RemoveCustomProviderErrors, RemoveCustomProviderResponses, RemoveExtensionData, RemoveExtensionErrors, RemoveExtensionResponses, ReplyData, ReplyErrors, ReplyResponses, ResetPromptData, ResetPromptErrors, ResetPromptResponses, RestartAgentData, RestartAgentErrors, RestartAgentResponses, ResumeAgentData, ResumeAgentErrors, ResumeAgentResponses, RunNowHandlerData, RunNowHandlerErrors, RunNowHandlerResponses, SavePromptData, SavePromptErrors, SavePromptResponses, SaveRecipeData, SaveRecipeErrors, SaveRecipeResponses, ScanRecipeData, ScanRecipeResponses, ScheduleRecipeData, ScheduleRecipeErrors, ScheduleRecipeResponses, SendTelemetryEventData, SendTelemetryEventResponses, SessionsHandlerData, SessionsHandlerErrors, SessionsHandlerResponses, SetConfigProviderData, SetRecipeSlashCommandData, SetRecipeSlashCommandErrors, SetRecipeSlashCommandResponses, StartAgentData, StartAgentErrors, StartAgentResponses, StartOpenrouterSetupData, StartOpenrouterSetupResponses, StartTetrateSetupData, StartTetrateSetupResponses, StartTunnelData, StartTunnelErrors, StartTunnelResponses, StatusData, StatusResponses, StopAgentData, StopAgentErrors, StopAgentResponses, StopTunnelData, StopTunnelErrors, StopTunnelResponses, SystemInfoData, SystemInfoResponses, TranscribeDictationData, TranscribeDictationErrors, TranscribeDictationResponses, UnpauseScheduleData, UnpauseScheduleErrors, UnpauseScheduleResponses, UpdateAgentProviderData, UpdateAgentProviderErrors, UpdateAgentProviderResponses, UpdateCustomProviderData, UpdateCustomProviderErrors, UpdateCustomProviderResponses, UpdateFromSessionData, UpdateFromSessionErrors, UpdateFromSessionResponses, UpdateScheduleData, UpdateScheduleErrors, UpdateScheduleResponses, UpdateSessionNameData, UpdateSessionNameErrors, UpdateSessionNameResponses, UpdateSessionUserRecipeValuesData, UpdateSessionUserRecipeValuesErrors, UpdateSessionUserRecipeValuesResponses, UpdateWorkingDirData, UpdateWorkingDirErrors, UpdateWorkingDirResponses, UpsertConfigData, UpsertConfigErrors, UpsertConfigResponses, UpsertPermissionsData, UpsertPermissionsErrors, UpsertPermissionsResponses, ValidateConfigData, ValidateConfigErrors, ValidateConfigResponses } from './types.gen';
export type Options<TData extends TDataShape = TDataShape, ThrowOnError extends boolean = boolean> = Options2<TData, ThrowOnError> & {
/**
@@ -283,6 +283,17 @@ export const validateConfig = (options?: O
export const diagnostics = <ThrowOnError extends boolean = false>(options: Options<DiagnosticsData, ThrowOnError>) => (options.client ?? client).get<DiagnosticsResponses, DiagnosticsErrors, ThrowOnError>({ url: '/diagnostics/{session_id}', ...options });
+export const getDictationConfig = <ThrowOnError extends boolean = false>(options?: Options<GetDictationConfigData, ThrowOnError>) => (options?.client ?? client).get<GetDictationConfigResponses, unknown, ThrowOnError>({ url: '/dictation/config', ...options });
+
+export const transcribeDictation = <ThrowOnError extends boolean = false>(options: Options<TranscribeDictationData, ThrowOnError>) => (options.client ?? client).post<TranscribeDictationResponses, TranscribeDictationErrors, ThrowOnError>({
+ url: '/dictation/transcribe',
+ ...options,
+ headers: {
+ 'Content-Type': 'application/json',
+ ...options.headers
+ }
+});
+
export const startOpenrouterSetup = <ThrowOnError extends boolean = false>(options?: Options<StartOpenrouterSetupData, ThrowOnError>) => (options?.client ?? client).post<StartOpenrouterSetupResponses, unknown, ThrowOnError>({ url: '/handle_openrouter', ...options });
export const startTetrateSetup = <ThrowOnError extends boolean = false>(options?: Options<StartTetrateSetupData, ThrowOnError>) => (options?.client ?? client).post<StartTetrateSetupResponses, unknown, ThrowOnError>({ url: '/handle_tetrate', ...options });
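
Taken together, the two new endpoints form the whole dictation round trip. A minimal usage sketch (hypothetical caller; the app reaches these functions through the '../api' re-exports, as useAudioRecorder below does):

import { getDictationConfig, transcribeDictation } from '../api';

async function transcribeSample(base64Audio: string): Promise<string | undefined> {
  // Ask the backend which dictation providers currently hold a usable API key.
  const config = await getDictationConfig();
  if (!config.data?.['openai']?.configured) {
    throw new Error('OpenAI dictation is not configured');
  }
  // POST the base64-encoded audio; throwOnError turns 4xx/5xx into exceptions.
  const result = await transcribeDictation({
    body: { audio: base64Audio, mime_type: 'audio/webm', provider: 'openai' },
    throwOnError: true,
  });
  return result.data?.text;
}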
diff --git a/ui/desktop/src/api/types.gen.ts b/ui/desktop/src/api/types.gen.ts
index 2bc51d6010af..08ff21ec1319 100644
--- a/ui/desktop/src/api/types.gen.ts
+++ b/ui/desktop/src/api/types.gen.ts
@@ -190,6 +190,35 @@ export type DetectProviderResponse = {
provider_name: string;
};
+export type DictationProvider = 'openai' | 'elevenlabs';
+
+export type DictationProviderStatus = {
+ /**
+ * Config key name if uses_provider_config is false
+ */
+ config_key?: string | null;
+ /**
+ * Whether the provider is fully configured and ready to use
+ */
+ configured: boolean;
+ /**
+ * Description of what this provider does
+ */
+ description: string;
+ /**
+ * Custom host URL if configured (only for providers that support it)
+ */
+ host?: string | null;
+ /**
+ * Path to settings if uses_provider_config is true
+ */
+ settings_path?: string | null;
+ /**
+ * Whether this provider uses the main provider config (true) or has its own key (false)
+ */
+ uses_provider_config: boolean;
+};
+
export type EmbeddedResource = {
_meta?: {
[key: string]: unknown;
@@ -1173,6 +1202,25 @@ export type ToolResponse = {
};
};
+export type TranscribeRequest = {
+ /**
+ * Base64 encoded audio data
+ */
+ audio: string;
+ /**
+ * MIME type of the audio (e.g., "audio/webm", "audio/wav")
+ */
+ mime_type: string;
+ provider: DictationProvider;
+};
+
+export type TranscribeResponse = {
+ /**
+ * Transcribed text from the audio
+ */
+ text: string;
+};
+
export type TunnelInfo = {
hostname: string;
secret: string;
@@ -2456,6 +2504,79 @@ export type DiagnosticsResponses = {
export type DiagnosticsResponse = DiagnosticsResponses[keyof DiagnosticsResponses];
+export type GetDictationConfigData = {
+ body?: never;
+ path?: never;
+ query?: never;
+ url: '/dictation/config';
+};
+
+export type GetDictationConfigResponses = {
+ /**
+ * Audio transcription provider configurations
+ */
+ 200: {
+ [key: string]: DictationProviderStatus;
+ };
+};
+
+export type GetDictationConfigResponse = GetDictationConfigResponses[keyof GetDictationConfigResponses];
+
+export type TranscribeDictationData = {
+ body: TranscribeRequest;
+ path?: never;
+ query?: never;
+ url: '/dictation/transcribe';
+};
+
+export type TranscribeDictationErrors = {
+ /**
+ * Invalid request (bad base64 or unsupported format)
+ */
+ 400: unknown;
+ /**
+ * Invalid API key
+ */
+ 401: unknown;
+ /**
+ * DictationProvider not configured
+ */
+ 412: unknown;
+ /**
+ * Audio file too large (max 25MB)
+ */
+ 413: unknown;
+ /**
+ * Rate limit exceeded
+ */
+ 429: unknown;
+ /**
+ * Internal server error
+ */
+ 500: unknown;
+ /**
+ * DictationProvider API error
+ */
+ 502: unknown;
+ /**
+ * Service unavailable
+ */
+ 503: unknown;
+ /**
+ * Request timeout
+ */
+ 504: unknown;
+};
+
+export type TranscribeDictationResponses = {
+ /**
+ * Audio transcribed successfully
+ */
+ 200: TranscribeResponse;
+};
+
+export type TranscribeDictationResponse = TranscribeDictationResponses[keyof TranscribeDictationResponses];
+
export type StartOpenrouterSetupData = {
body?: never;
path?: never;
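
All error bodies above are typed unknown, so callers can only branch on the status code. A hedged sketch of such a branch (hypothetical helper, not part of the patch; messages taken from the response docs above):

function describeTranscribeError(status: number): string {
  switch (status) {
    case 400: return 'Invalid request (bad base64 or unsupported format)';
    case 401: return 'Invalid API key';
    case 412: return 'Dictation provider not configured';
    case 413: return 'Audio file too large (max 25MB)';
    case 429: return 'Rate limit exceeded';
    default: return 'Transcription failed';
  }
}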
diff --git a/ui/desktop/src/components/ChatInput.tsx b/ui/desktop/src/components/ChatInput.tsx
index 638d362779dc..4c8e2e99c68d 100644
--- a/ui/desktop/src/components/ChatInput.tsx
+++ b/ui/desktop/src/components/ChatInput.tsx
@@ -16,12 +16,10 @@ import { BottomMenuExtensionSelection } from './bottom_menu/BottomMenuExtensionS
import { AlertType, useAlerts } from './alerts';
import { useConfig } from './ConfigContext';
import { useModelAndProvider } from './ModelAndProviderContext';
-import { useWhisper } from '../hooks/useWhisper';
-import { DICTATION_PROVIDER_ELEVENLABS } from '../hooks/dictationConstants';
-import { WaveformVisualizer } from './WaveformVisualizer';
+import { useAudioRecorder } from '../hooks/useAudioRecorder';
import { toastError } from '../toasts';
import MentionPopover, { DisplayItemWithMatch } from './MentionPopover';
-import { COST_TRACKING_ENABLED, VOICE_DICTATION_ELEVENLABS_ENABLED } from '../updates';
+import { COST_TRACKING_ENABLED } from '../updates';
import { CostTracker } from './bottom_menu/CostTracker';
import { DroppedFile, useFileDrop } from '../hooks/useFileDrop';
import { Recipe } from '../recipe';
@@ -254,19 +252,17 @@ export default function ChatInput({
selectFile: (index: number) => void;
}>(null);
- // Whisper hook for voice dictation
+ // Audio recorder hook for voice dictation
const {
+ isEnabled,
+ dictationProvider,
isRecording,
isTranscribing,
- canUseDictation,
- audioContext,
- analyser,
startRecording,
stopRecording,
recordingDuration,
estimatedSize,
- dictationSettings,
- } = useWhisper({
+ } = useAudioRecorder({
onTranscription: (text) => {
trackVoiceDictation('transcribed');
// Append transcribed text to the current input
@@ -283,12 +279,6 @@ export default function ChatInput({
msg: error.message,
});
},
- onSizeWarning: (sizeMB) => {
- toastError({
- title: 'Recording Size Warning',
- msg: `Recording is ${sizeMB.toFixed(1)}MB. Maximum size is 25MB.`,
- });
- },
});
const internalTextAreaRef = useRef<HTMLTextAreaElement>(null);
const textAreaRef = inputRef || internalTextAreaRef;
@@ -1236,26 +1226,25 @@ export default function ChatInput({
maxHeight: `${maxHeight}px`,
overflowY: 'auto',
opacity: isRecording ? 0 : 1,
- paddingRight: dictationSettings?.enabled ? '180px' : '120px',
+ paddingRight: dictationProvider ? '180px' : '120px',
}}
className="w-full outline-none border-none focus:ring-0 bg-transparent px-3 pt-3 pb-1.5 text-sm resize-none text-textStandard placeholder:text-textPlaceholder"
/>
{isRecording && (
)}
{/* Inline action buttons - absolutely positioned on the right */}
- {/* Microphone button - show only if dictation is enabled */}
- {dictationSettings?.enabled && (
+ {/* Microphone button - show only if provider is selected */}
+ {dictationProvider && (
<>
- {!canUseDictation ? (
+ {!isEnabled ? (
@@ -1273,18 +1262,17 @@ export default function ChatInput({
- {dictationSettings.provider === 'openai' ? (
+ {dictationProvider === 'openai' ? (
OpenAI API key is not configured. Set it up in Settings {'>'}{' '}
Models.
- ) : VOICE_DICTATION_ELEVENLABS_ENABLED &&
- dictationSettings.provider === DICTATION_PROVIDER_ELEVENLABS ? (
+ ) : dictationProvider === 'elevenlabs' ? (
ElevenLabs API key is not configured. Set it up in Settings {'>'}{' '}
Chat {'>'} Voice Dictation.
- ) : dictationSettings.provider === null ? (
+ ) : dictationProvider === null ? (
Dictation is not configured. Configure it in Settings {'>'}{' '}
Chat {'>'} Voice Dictation.
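
Net effect of the ChatInput changes: the mic UI is keyed off two hook values instead of a settings object. A condensed sketch of the gating (the left-hand names are hypothetical; the right-hand values come from useAudioRecorder):

const showMicButton = dictationProvider !== null; // a provider is selected at all
const micNeedsSetup = showMicButton && !isEnabled; // selected, but its API key is missing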
diff --git a/ui/desktop/src/components/settings/chat/ChatSettingsSection.tsx b/ui/desktop/src/components/settings/chat/ChatSettingsSection.tsx
index fc7ae5acd61e..ab630ecd96cf 100644
--- a/ui/desktop/src/components/settings/chat/ChatSettingsSection.tsx
+++ b/ui/desktop/src/components/settings/chat/ChatSettingsSection.tsx
@@ -1,5 +1,5 @@
import { ModeSection } from '../mode/ModeSection';
-import DictationSection from '../dictation/DictationSection';
+import { DictationSettings } from '../dictation/DictationSettings';
import { SecurityToggle } from '../security/SecurityToggle';
import { ResponseStylesSection } from '../response_styles/ResponseStylesSection';
import { GoosehintsSection } from './GoosehintsSection';
@@ -21,30 +21,30 @@ export default function ChatSettingsSection() {
-
+
-
- Response Styles
- Choose how Goose should format and style its responses
-
-
+
+
+
+ Response Styles
+ Choose how Goose should format and style its responses
+
-
-
+
-
+
diff --git a/ui/desktop/src/components/settings/dictation/DictationSection.tsx b/ui/desktop/src/components/settings/dictation/DictationSection.tsx
deleted file mode 100644
index a4abcfcc087d..000000000000
--- a/ui/desktop/src/components/settings/dictation/DictationSection.tsx
+++ /dev/null
@@ -1,5 +0,0 @@
-import { VoiceDictationToggle } from './VoiceDictationToggle';
-
-export default function DictationSection() {
- return <VoiceDictationToggle />;
-}
diff --git a/ui/desktop/src/components/settings/dictation/DictationSettings.tsx b/ui/desktop/src/components/settings/dictation/DictationSettings.tsx
new file mode 100644
index 000000000000..482fcb31f5c7
--- /dev/null
+++ b/ui/desktop/src/components/settings/dictation/DictationSettings.tsx
@@ -0,0 +1,249 @@
+import { useState, useEffect } from 'react';
+import { ChevronDown } from 'lucide-react';
+import { DictationProvider, getDictationConfig, DictationProviderStatus } from '../../../api';
+import { useConfig } from '../../ConfigContext';
+import { Input } from '../../ui/input';
+import { Button } from '../../ui/button';
+import { trackSettingToggled } from '../../../utils/analytics';
+
+export const DictationSettings = () => {
+ const [provider, setProvider] = useState<DictationProvider | null>(null);
+ const [showProviderDropdown, setShowProviderDropdown] = useState(false);
+ const [providerStatuses, setProviderStatuses] = useState<Record<string, DictationProviderStatus>>(
+ {}
+ );
+ const [apiKey, setApiKey] = useState('');
+ const [isEditingKey, setIsEditingKey] = useState(false);
+ const [keyValidationError, setKeyValidationError] = useState('');
+ const { read, upsert, remove } = useConfig();
+
+ useEffect(() => {
+ const loadSettings = async () => {
+ try {
+ const providerValue = await read('voice_dictation_provider', false);
+ const loadedProvider: DictationProvider | null =
+ (providerValue as DictationProvider) || null;
+ setProvider(loadedProvider);
+
+ // Get provider configuration status from backend
+ const audioConfig = await getDictationConfig();
+ setProviderStatuses(audioConfig.data || {});
+ } catch (error) {
+ console.error('Error loading dictation settings:', error);
+ }
+ };
+
+ loadSettings();
+ }, [read]);
+
+ const saveProvider = async (newProvider: DictationProvider | null) => {
+ console.log('Saving dictation provider to backend config:', newProvider);
+ setProvider(newProvider);
+ try {
+ await upsert('voice_dictation_provider', newProvider || '', false);
+ trackSettingToggled('voice_dictation', newProvider !== null);
+ } catch (error) {
+ console.error('Error saving dictation provider:', error);
+ }
+ };
+
+ const handleProviderChange = (newProvider: DictationProvider | null) => {
+ saveProvider(newProvider);
+ setShowProviderDropdown(false);
+ };
+
+ const handleDropdownToggle = async () => {
+ const newShowState = !showProviderDropdown;
+ setShowProviderDropdown(newShowState);
+
+ if (newShowState) {
+ try {
+ const audioConfig = await getDictationConfig();
+ setProviderStatuses(audioConfig.data || {});
+ } catch (error) {
+ console.error('Error checking provider configuration:', error);
+ }
+ }
+ };
+
+ const handleSaveKey = async () => {
+ if (!provider) return;
+ const providerConfig = providerStatuses[provider];
+ if (!providerConfig || providerConfig.uses_provider_config) return;
+
+ const trimmedKey = apiKey.trim();
+ if (!trimmedKey) {
+ setKeyValidationError('API key is required');
+ return;
+ }
+
+ try {
+ const keyName = providerConfig.config_key!;
+ await upsert(keyName, trimmedKey, true);
+ setApiKey('');
+ setKeyValidationError('');
+ setIsEditingKey(false);
+
+ const audioConfig = await getDictationConfig();
+ setProviderStatuses(audioConfig.data || {});
+ } catch (error) {
+ console.error('Error saving API key:', error);
+ setKeyValidationError('Failed to save API key');
+ }
+ };
+
+ const handleRemoveKey = async () => {
+ if (!provider) return;
+ const providerConfig = providerStatuses[provider];
+ if (!providerConfig || providerConfig.uses_provider_config) return;
+
+ try {
+ const keyName = providerConfig.config_key!;
+ await remove(keyName, true);
+ setApiKey('');
+ setKeyValidationError('');
+ setIsEditingKey(false);
+
+ const audioConfig = await getDictationConfig();
+ setProviderStatuses(audioConfig.data || {});
+ } catch (error) {
+ console.error('Error removing API key:', error);
+ setKeyValidationError('Failed to remove API key');
+ }
+ };
+
+ const handleCancelEdit = () => {
+ setApiKey('');
+ setKeyValidationError('');
+ setIsEditingKey(false);
+ };
+
+ const getProviderLabel = (provider: DictationProvider | null): string => {
+ if (!provider) return 'Disabled';
+ return provider.charAt(0).toUpperCase() + provider.slice(1);
+ };
+
+ return (
+
+
+
+
Voice Dictation Provider
+
+ Choose how voice is converted to text
+
+
+
+
+
+ {showProviderDropdown && (
+
+
+
+ {(Object.keys(providerStatuses) as DictationProvider[]).map((p) => (
+
+ ))}
+
+ )}
+
+
+
+ {provider && providerStatuses[provider] && (
+ <>
+
+
{providerStatuses[provider].description}
+
+
+ {providerStatuses[provider].uses_provider_config ? (
+
+ {!providerStatuses[provider].configured ? (
+
+ Configure the API key in {providerStatuses[provider].settings_path}
+
+ ) : (
+
+ ✓ Configured in {providerStatuses[provider].settings_path}
+
+ )}
+
+ ) : (
+
+
+
API Key
+
+ Required for transcription
+ {providerStatuses[provider]?.configured && (
+ (Configured)
+ )}
+
+
+
+ {!isEditingKey ? (
+
+ ) : (
+
+
{
+ setApiKey(e.target.value);
+ if (keyValidationError) setKeyValidationError('');
+ }}
+ placeholder="Enter your API key"
+ className="max-w-md"
+ autoFocus
+ />
+ {keyValidationError && (
+
{keyValidationError}
+ )}
+
+
+
+ {providerStatuses[provider]?.configured && (
+
+ )}
+
+
+ )}
+
+ )}
+ >
+ )}
+
+ );
+};
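
DictationSettings persists the choice through the backend config store rather than localStorage. A minimal sketch of the round trip it relies on (useConfig's read/upsert exactly as used above; the third argument marks the value as non-secret):

await upsert('voice_dictation_provider', 'openai', false); // '' clears the selection
const value = await read('voice_dictation_provider', false);
const provider = (value as DictationProvider) || null; // empty/missing means disabled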
diff --git a/ui/desktop/src/components/settings/dictation/ElevenLabsKeyInput.tsx b/ui/desktop/src/components/settings/dictation/ElevenLabsKeyInput.tsx
deleted file mode 100644
index 1856a8c5e5c5..000000000000
--- a/ui/desktop/src/components/settings/dictation/ElevenLabsKeyInput.tsx
+++ /dev/null
@@ -1,128 +0,0 @@
-import { useState, useEffect, useCallback } from 'react';
-import { Input } from '../../ui/input';
-import { Button } from '../../ui/button';
-import { useConfig } from '../../ConfigContext';
-import { ELEVENLABS_API_KEY, isSecretKeyConfigured } from '../../../hooks/dictationConstants';
-import { setElevenLabsKeyCache } from '../../../hooks/useDictationSettings';
-
-export const ElevenLabsKeyInput = () => {
- const [elevenLabsApiKey, setElevenLabsApiKey] = useState('');
- const [isLoadingKey, setIsLoadingKey] = useState(false);
- const [hasElevenLabsKey, setHasElevenLabsKey] = useState(false);
- const [validationError, setValidationError] = useState('');
- const [isEditing, setIsEditing] = useState(false);
- const { upsert, read, remove } = useConfig();
-
- const loadKey = useCallback(async () => {
- setIsLoadingKey(true);
- try {
- const response = await read(ELEVENLABS_API_KEY, true);
- const hasKey = isSecretKeyConfigured(response);
- setHasElevenLabsKey(hasKey);
- setElevenLabsKeyCache(hasKey);
- } catch (error) {
- console.error(error);
- setElevenLabsKeyCache(false);
- } finally {
- setIsLoadingKey(false);
- }
- }, [read]);
-
- useEffect(() => {
- loadKey();
- }, [loadKey]);
-
- const handleElevenLabsKeyChange = (key: string) => {
- setElevenLabsApiKey(key);
- if (validationError) {
- setValidationError('');
- }
- };
-
- const handleSave = async () => {
- try {
- const trimmedKey = elevenLabsApiKey.trim();
-
- if (!trimmedKey) {
- setValidationError('API key is required');
- return;
- }
-
- await upsert(ELEVENLABS_API_KEY, trimmedKey, true);
- setElevenLabsApiKey('');
- setValidationError('');
- setIsEditing(false);
- await loadKey();
- } catch (error) {
- console.error(error);
- setValidationError('Failed to save API key');
- }
- };
-
- const handleRemove = async () => {
- try {
- await remove(ELEVENLABS_API_KEY, true);
- await loadKey();
- setElevenLabsApiKey('');
- setValidationError('');
- setIsEditing(false);
- } catch (error) {
- console.error(error);
- setValidationError('Failed to remove API key');
- }
- };
-
- const handleCancel = () => {
- setElevenLabsApiKey('');
- setValidationError('');
- setIsEditing(false);
- };
-
- return (
-
-
-
ElevenLabs API Key
-
- Required for ElevenLabs voice recognition
- {hasElevenLabsKey && (Configured)}
-
-
-
- {!isEditing ? (
-
- ) : (
-
-
handleElevenLabsKeyChange(e.target.value)}
- placeholder="Enter your ElevenLabs API key"
- className="max-w-md"
- autoFocus
- />
- {validationError &&
{validationError}
}
-
-
-
- {hasElevenLabsKey && (
-
- )}
-
-
- )}
-
- );
-};
diff --git a/ui/desktop/src/components/settings/dictation/ProviderInfo.tsx b/ui/desktop/src/components/settings/dictation/ProviderInfo.tsx
deleted file mode 100644
index 187f790d1f2f..000000000000
--- a/ui/desktop/src/components/settings/dictation/ProviderInfo.tsx
+++ /dev/null
@@ -1,41 +0,0 @@
-import { DictationProvider } from '../../../hooks/useDictationSettings';
-import { DICTATION_PROVIDER_ELEVENLABS } from '../../../hooks/dictationConstants';
-import { VOICE_DICTATION_ELEVENLABS_ENABLED } from '../../../updates';
-
-interface ProviderInfoProps {
- provider: DictationProvider;
-}
-
-export const ProviderInfo = ({ provider }: ProviderInfoProps) => {
- if (!provider) return null;
-
- return (
-
- {provider === 'openai' && (
-
- Uses OpenAI's Whisper API for high-quality transcription. Requires an OpenAI API key
- configured in the Models section.
-
- )}
- {VOICE_DICTATION_ELEVENLABS_ENABLED && provider === DICTATION_PROVIDER_ELEVENLABS && (
-
-
- Uses ElevenLabs speech-to-text API for high-quality transcription.
-
-
- Features:
-
-
- - Advanced voice processing
- - High accuracy transcription
- - Multiple language support
- - Fast processing
-
-
- Note: Requires an ElevenLabs API key with speech-to-text access.
-
-
- )}
-
- );
-};
diff --git a/ui/desktop/src/components/settings/dictation/ProviderSelector.tsx b/ui/desktop/src/components/settings/dictation/ProviderSelector.tsx
deleted file mode 100644
index 4b3878509a06..000000000000
--- a/ui/desktop/src/components/settings/dictation/ProviderSelector.tsx
+++ /dev/null
@@ -1,128 +0,0 @@
-import { useState, useEffect } from 'react';
-import { ChevronDown } from 'lucide-react';
-import { DictationProvider, DictationSettings } from '../../../hooks/useDictationSettings';
-import {
- DICTATION_PROVIDER_OPENAI,
- DICTATION_PROVIDER_ELEVENLABS,
-} from '../../../hooks/dictationConstants';
-import { useConfig } from '../../ConfigContext';
-import { ElevenLabsKeyInput } from './ElevenLabsKeyInput';
-import { ProviderInfo } from './ProviderInfo';
-import { VOICE_DICTATION_ELEVENLABS_ENABLED } from '../../../updates';
-
-interface ProviderSelectorProps {
- settings: DictationSettings;
- onProviderChange: (provider: DictationProvider) => void;
-}
-
-export const ProviderSelector = ({ settings, onProviderChange }: ProviderSelectorProps) => {
- const [hasOpenAIKey, setHasOpenAIKey] = useState(false);
- const [showProviderDropdown, setShowProviderDropdown] = useState(false);
- const { getProviders } = useConfig();
-
- useEffect(() => {
- const checkOpenAIKey = async () => {
- try {
- const providers = await getProviders(false);
- const openAIProvider = providers.find((p) => p.name === 'openai');
- setHasOpenAIKey(openAIProvider?.is_configured || false);
- } catch (error) {
- console.error('Error checking OpenAI configuration:', error);
- setHasOpenAIKey(false);
- }
- };
-
- checkOpenAIKey();
- }, [getProviders]);
-
- const handleDropdownToggle = async () => {
- const newShowState = !showProviderDropdown;
- setShowProviderDropdown(newShowState);
-
- if (newShowState) {
- try {
- const providers = await getProviders(true);
- const openAIProvider = providers.find((p) => p.name === 'openai');
- const isConfigured = !!openAIProvider?.is_configured;
- setHasOpenAIKey(isConfigured);
- } catch (error) {
- console.error('Error checking OpenAI configuration:', error);
- setHasOpenAIKey(false);
- }
- }
- };
-
- const handleProviderChange = (provider: DictationProvider) => {
- onProviderChange(provider);
- setShowProviderDropdown(false);
- };
-
- const getProviderLabel = (provider: DictationProvider): string => {
- switch (provider) {
- case DICTATION_PROVIDER_OPENAI:
- return 'OpenAI Whisper';
- case DICTATION_PROVIDER_ELEVENLABS:
- return 'ElevenLabs';
- default:
- return 'None (disabled)';
- }
- };
-
- return (
-
-
-
-
Dictation Provider
-
- Choose how voice is converted to text
-
-
-
-
-
- {showProviderDropdown && (
-
-
-
- {VOICE_DICTATION_ELEVENLABS_ENABLED && (
-
- )}
-
- )}
-
-
-
- {VOICE_DICTATION_ELEVENLABS_ENABLED &&
- settings.provider === DICTATION_PROVIDER_ELEVENLABS &&
}
-
-
-
- );
-};
diff --git a/ui/desktop/src/components/settings/dictation/VoiceDictationToggle.tsx b/ui/desktop/src/components/settings/dictation/VoiceDictationToggle.tsx
deleted file mode 100644
index 6af1c71c8916..000000000000
--- a/ui/desktop/src/components/settings/dictation/VoiceDictationToggle.tsx
+++ /dev/null
@@ -1,97 +0,0 @@
-import { useState, useEffect } from 'react';
-import { Switch } from '../../ui/switch';
-import { DictationProvider, DictationSettings } from '../../../hooks/useDictationSettings';
-import {
- DICTATION_SETTINGS_KEY,
- DICTATION_PROVIDER_OPENAI,
- DICTATION_PROVIDER_ELEVENLABS,
- getDefaultDictationSettings,
-} from '../../../hooks/dictationConstants';
-import { useConfig } from '../../ConfigContext';
-import { ProviderSelector } from './ProviderSelector';
-import { VOICE_DICTATION_ELEVENLABS_ENABLED } from '../../../updates';
-import { trackSettingToggled } from '../../../utils/analytics';
-
-export const VoiceDictationToggle = () => {
- const [settings, setSettings] = useState<DictationSettings>({
- enabled: false,
- provider: null,
- });
- const { getProviders } = useConfig();
-
- useEffect(() => {
- const loadSettings = async () => {
- const savedSettings = localStorage.getItem(DICTATION_SETTINGS_KEY);
-
- let loadedSettings: DictationSettings;
-
- if (savedSettings) {
- const parsed = JSON.parse(savedSettings);
- loadedSettings = parsed;
-
- // If ElevenLabs is disabled and user has it selected, reset to OpenAI
- if (
- !VOICE_DICTATION_ELEVENLABS_ENABLED &&
- loadedSettings.provider === DICTATION_PROVIDER_ELEVENLABS
- ) {
- loadedSettings = {
- ...loadedSettings,
- provider: DICTATION_PROVIDER_OPENAI,
- };
- localStorage.setItem(DICTATION_SETTINGS_KEY, JSON.stringify(loadedSettings));
- }
- } else {
- loadedSettings = await getDefaultDictationSettings(getProviders);
- }
-
- setSettings(loadedSettings);
- };
-
- loadSettings();
- }, [getProviders]);
-
- const saveSettings = (newSettings: DictationSettings) => {
- console.log('Saving dictation settings to localStorage:', newSettings);
- setSettings(newSettings);
- localStorage.setItem(DICTATION_SETTINGS_KEY, JSON.stringify(newSettings));
- };
-
- const handleToggle = (enabled: boolean) => {
- saveSettings({
- ...settings,
- enabled,
- provider: settings.provider === null ? DICTATION_PROVIDER_OPENAI : settings.provider,
- });
- trackSettingToggled('voice_dictation', enabled);
- };
-
- const handleProviderChange = (provider: DictationProvider) => {
- saveSettings({ ...settings, provider });
- };
-
- return (
-
-
-
-
Enable Voice Dictation
-
- Show microphone button for voice input
-
-
-
-
-
-
-
-
-
- );
-};
diff --git a/ui/desktop/src/hooks/dictationConstants.ts b/ui/desktop/src/hooks/dictationConstants.ts
deleted file mode 100644
index 1863c6decace..000000000000
--- a/ui/desktop/src/hooks/dictationConstants.ts
+++ /dev/null
@@ -1,31 +0,0 @@
-import { DictationSettings, DictationProvider } from './useDictationSettings';
-
-export const DICTATION_SETTINGS_KEY = 'dictation_settings';
-export const ELEVENLABS_API_KEY = 'ELEVENLABS_API_KEY';
-export const DICTATION_PROVIDER_OPENAI = 'openai' as const;
-export const DICTATION_PROVIDER_ELEVENLABS = 'elevenlabs' as const;
-
-export const isSecretKeyConfigured = (response: unknown): boolean =>
- typeof response === 'object' &&
- response !== null &&
- 'maskedValue' in response &&
- !!(response as { maskedValue: string }).maskedValue;
-
-export const getDefaultDictationSettings = async (
- getProviders: (refresh: boolean) => Promise<{ name: string; is_configured: boolean }[]>
-): Promise<DictationSettings> => {
- const providers = await getProviders(false);
- const openAIProvider = providers.find((p) => p.name === 'openai');
-
- if (openAIProvider && openAIProvider.is_configured) {
- return {
- enabled: true,
- provider: DICTATION_PROVIDER_OPENAI,
- };
- } else {
- return {
- enabled: false,
- provider: null as DictationProvider,
- };
- }
-};
diff --git a/ui/desktop/src/hooks/useAudioRecorder.ts b/ui/desktop/src/hooks/useAudioRecorder.ts
new file mode 100644
index 000000000000..151bcdf74c1b
--- /dev/null
+++ b/ui/desktop/src/hooks/useAudioRecorder.ts
@@ -0,0 +1,249 @@
+import { useState, useRef, useCallback, useEffect } from 'react';
+import { transcribeDictation, getDictationConfig, DictationProvider } from '../api';
+import { useConfig } from '../components/ConfigContext';
+
+interface UseAudioRecorderOptions {
+ onTranscription?: (text: string) => void;
+ onError?: (error: Error) => void;
+}
+
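+// 25 MB mirrors the 413 limit documented on the transcribe endpoint.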
+const MAX_AUDIO_SIZE_MB = 25;
+const MAX_RECORDING_DURATION_SECONDS = 600; // 10 minutes
+
+export const useAudioRecorder = ({ onTranscription, onError }: UseAudioRecorderOptions = {}) => {
+ const [isRecording, setIsRecording] = useState(false);
+ const [isTranscribing, setIsTranscribing] = useState(false);
+ const [recordingDuration, setRecordingDuration] = useState(0);
+ const [estimatedSize, setEstimatedSize] = useState(0);
+ const [isEnabled, setIsEnabled] = useState(false);
+ const [provider, setProvider] = useState<DictationProvider | null>(null);
+
+ const { read } = useConfig();
+
+ const mediaRecorderRef = useRef<MediaRecorder | null>(null);
+ const audioChunksRef = useRef<Blob[]>([]);
+ const streamRef = useRef<MediaStream | null>(null);
+ const durationIntervalRef = useRef<ReturnType<typeof setInterval> | null>(null);
+
+ // Check provider configuration on mount
+ useEffect(() => {
+ const checkProviderConfig = async () => {
+ try {
+ // Read provider preference from backend config
+ const providerValue = await read('voice_dictation_provider', false);
+ const preferredProvider = (providerValue as DictationProvider) || null;
+ console.log('[useAudioRecorder] Read voice_dictation_provider:', preferredProvider);
+
+ // If no provider selected, dictation is disabled
+ if (!preferredProvider) {
+ console.log('[useAudioRecorder] No provider selected, setting to null');
+ setIsEnabled(false);
+ setProvider(null);
+ return;
+ }
+
+ // Check backend audio config to see if provider is actually configured (has API key)
+ const audioConfigResponse = await getDictationConfig();
+ const providerStatus = audioConfigResponse.data?.[preferredProvider];
+ console.log(
+ '[useAudioRecorder] Provider status for',
+ preferredProvider,
+ ':',
+ providerStatus
+ );
+
+ if (providerStatus?.configured) {
+ console.log('[useAudioRecorder] Provider is configured, enabling');
+ setIsEnabled(true);
+ setProvider(preferredProvider);
+ } else {
+ console.log('[useAudioRecorder] Provider not configured, disabling but keeping provider');
+ setIsEnabled(false);
+ setProvider(preferredProvider);
+ }
+ } catch (error) {
+ console.error('Error checking audio config:', error);
+ setIsEnabled(false);
+ setProvider(null);
+ }
+ };
+
+ checkProviderConfig();
+ }, [read]);
+
+ const stopRecording = useCallback(() => {
+ setIsRecording(false);
+
+ if (mediaRecorderRef.current && mediaRecorderRef.current.state !== 'inactive') {
+ mediaRecorderRef.current.stop();
+ }
+
+ if (durationIntervalRef.current) {
+ clearInterval(durationIntervalRef.current);
+ durationIntervalRef.current = null;
+ }
+
+ if (streamRef.current) {
+ streamRef.current.getTracks().forEach((track) => track.stop());
+ streamRef.current = null;
+ }
+ }, []);
+
+ // Cleanup on unmount
+ useEffect(() => {
+ return () => {
+ if (durationIntervalRef.current) {
+ clearInterval(durationIntervalRef.current);
+ }
+ if (streamRef.current) {
+ streamRef.current.getTracks().forEach((track) => track.stop());
+ }
+ };
+ }, []);
+
+ const transcribeAudio = useCallback(
+ async (audioBlob: Blob) => {
+ if (!provider) {
+ onError?.(new Error('No transcription provider configured'));
+ return;
+ }
+
+ setIsTranscribing(true);
+
+ try {
+ // Check file size
+ const sizeMB = audioBlob.size / (1024 * 1024);
+ if (sizeMB > MAX_AUDIO_SIZE_MB) {
+ throw new Error(
+ `Audio file too large (${sizeMB.toFixed(1)}MB). Maximum size is ${MAX_AUDIO_SIZE_MB}MB.`
+ );
+ }
+
+ // Convert to base64
+ const reader = new FileReader();
+ const base64Audio = await new Promise<string>((resolve, reject) => {
+ reader.onloadend = () => {
+ const base64 = reader.result as string;
+ resolve(base64.split(',')[1]);
+ };
+ reader.onerror = reject;
+ reader.readAsDataURL(audioBlob);
+ });
+
+ const mimeType = audioBlob.type;
+ if (!mimeType) {
+ throw new Error('Unable to determine audio format');
+ }
+
+ // Transcribe using generated API
+ const result = await transcribeDictation({
+ body: {
+ audio: base64Audio,
+ mime_type: mimeType,
+ provider: provider,
+ },
+ throwOnError: true,
+ });
+
+ if (result.data?.text) {
+ onTranscription?.(result.data.text);
+ }
+ } catch (error) {
+ console.error('Error transcribing audio:', error);
+ onError?.(error as Error);
+ } finally {
+ setIsTranscribing(false);
+ setRecordingDuration(0);
+ setEstimatedSize(0);
+ }
+ },
+ [provider, onTranscription, onError]
+ );
+
+ const startRecording = useCallback(async () => {
+ if (!isEnabled) {
+ onError?.(new Error('Voice dictation is not enabled'));
+ return;
+ }
+
+ try {
+ const stream = await navigator.mediaDevices.getUserMedia({
+ audio: {
+ echoCancellation: true,
+ noiseSuppression: true,
+ autoGainControl: true,
+ },
+ });
+ streamRef.current = stream;
+
+ // Determine best supported MIME type
+ const supportedTypes = ['audio/webm;codecs=opus', 'audio/webm', 'audio/mp4', 'audio/wav'];
+ const mimeType = supportedTypes.find((type) => MediaRecorder.isTypeSupported(type)) || '';
+
+ const mediaRecorder = new MediaRecorder(stream, mimeType ? { mimeType } : {});
+ mediaRecorderRef.current = mediaRecorder;
+ audioChunksRef.current = [];
+
+ // Track recording duration and size
+ const startTime = Date.now();
+ durationIntervalRef.current = setInterval(() => {
+ const elapsed = (Date.now() - startTime) / 1000;
+ setRecordingDuration(elapsed);
+
+ // Estimate size based on typical webm bitrate (~128kbps)
+ const estimatedSizeMB = (elapsed * 128 * 1024) / (8 * 1024 * 1024);
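+ // e.g. 60 s of audio ≈ (60 * 128 * 1024) / (8 * 1024 * 1024) ≈ 0.94 MB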
+ setEstimatedSize(estimatedSizeMB);
+
+ // Auto-stop at max duration
+ if (elapsed >= MAX_RECORDING_DURATION_SECONDS) {
+ stopRecording();
+ onError?.(
+ new Error(
+ `Maximum recording duration (${MAX_RECORDING_DURATION_SECONDS / 60} minutes) reached`
+ )
+ );
+ }
+ }, 100);
+
+ mediaRecorder.ondataavailable = (event) => {
+ if (event.data.size > 0) {
+ audioChunksRef.current.push(event.data);
+ }
+ };
+
+ mediaRecorder.onstop = async () => {
+ const audioBlob = new Blob(audioChunksRef.current, { type: mimeType || 'audio/webm' });
+
+ if (audioBlob.size === 0) {
+ onError?.(new Error('No audio data was recorded. Please check your microphone.'));
+ return;
+ }
+
+ await transcribeAudio(audioBlob);
+ };
+
+ mediaRecorder.onerror = (event) => {
+ console.error('MediaRecorder error:', event);
+ onError?.(new Error('Recording failed'));
+ };
+
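+ // 100 ms timeslice: emit dataavailable chunks throughout the recording.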
+ mediaRecorder.start(100);
+ setIsRecording(true);
+ } catch (error) {
+ console.error('Error starting recording:', error);
+ stopRecording();
+ onError?.(error as Error);
+ }
+ }, [isEnabled, onError, transcribeAudio, stopRecording]);
+
+ return {
+ isEnabled,
+ dictationProvider: provider,
+ isRecording,
+ isTranscribing,
+ recordingDuration,
+ estimatedSize,
+ startRecording,
+ stopRecording,
+ };
+};
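
For reference, a minimal consumer of the hook (sketch only; setDraft is a hypothetical state setter, while toastError and the option names match the ChatInput wiring above):

const recorder = useAudioRecorder({
  onTranscription: (text) => setDraft((d) => (d ? d + ' ' : '') + text),
  onError: (err) => toastError({ title: 'Voice Dictation', msg: err.message }),
});
// recorder.isEnabled gates the mic button; recorder.startRecording() begins capture.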
diff --git a/ui/desktop/src/hooks/useDictationSettings.ts b/ui/desktop/src/hooks/useDictationSettings.ts
deleted file mode 100644
index 4afe4e0f2574..000000000000
--- a/ui/desktop/src/hooks/useDictationSettings.ts
+++ /dev/null
@@ -1,76 +0,0 @@
-import { useState, useEffect } from 'react';
-import { useConfig } from '../components/ConfigContext';
-import {
- DICTATION_SETTINGS_KEY,
- ELEVENLABS_API_KEY,
- DICTATION_PROVIDER_OPENAI,
- DICTATION_PROVIDER_ELEVENLABS,
- getDefaultDictationSettings,
- isSecretKeyConfigured,
-} from './dictationConstants';
-
-export type DictationProvider =
- | typeof DICTATION_PROVIDER_OPENAI
- | typeof DICTATION_PROVIDER_ELEVENLABS
- | null;
-
-export interface DictationSettings {
- enabled: boolean;
- provider: DictationProvider;
-}
-
-let elevenLabsKeyCache: boolean | null = null;
-
-export const setElevenLabsKeyCache = (value: boolean) => {
- elevenLabsKeyCache = value;
-};
-
-export const useDictationSettings = () => {
- const [settings, setSettings] = useState<DictationSettings | null>(null);
- const [hasElevenLabsKey, setHasElevenLabsKey] = useState(elevenLabsKeyCache ?? false);
- const { read, getProviders } = useConfig();
-
- useEffect(() => {
- const loadSettings = async () => {
- const saved = localStorage.getItem(DICTATION_SETTINGS_KEY);
-
- let currentSettings: DictationSettings;
- if (saved) {
- currentSettings = JSON.parse(saved);
- } else {
- currentSettings = await getDefaultDictationSettings(getProviders);
- }
- setSettings(currentSettings);
- if (
- currentSettings.provider === DICTATION_PROVIDER_ELEVENLABS &&
- elevenLabsKeyCache === null
- ) {
- try {
- const response = await read(ELEVENLABS_API_KEY, true);
- const hasKey = isSecretKeyConfigured(response);
- elevenLabsKeyCache = hasKey;
- setHasElevenLabsKey(hasKey);
- } catch (error) {
- elevenLabsKeyCache = false;
- setHasElevenLabsKey(false);
- console.error('[useDictationSettings] Error checking ElevenLabs API key:', error);
- }
- }
- };
-
- loadSettings();
-
- // Listen for storage changes from other tabs/windows
- // eslint-disable-next-line @typescript-eslint/no-explicit-any
- const handleStorageChange = (e: any) => {
- if (e.key === DICTATION_SETTINGS_KEY && e.newValue) {
- setSettings(JSON.parse(e.newValue));
- }
- };
-
- window.addEventListener('storage', handleStorageChange);
- return () => window.removeEventListener('storage', handleStorageChange);
- }, [read, getProviders]);
-
- return { settings, hasElevenLabsKey };
-};
diff --git a/ui/desktop/src/hooks/useWhisper.ts b/ui/desktop/src/hooks/useWhisper.ts
deleted file mode 100644
index 09c712c38aa5..000000000000
--- a/ui/desktop/src/hooks/useWhisper.ts
+++ /dev/null
@@ -1,378 +0,0 @@
-import { useState, useRef, useCallback, useEffect } from 'react';
-import { useConfig } from '../components/ConfigContext';
-import { getApiUrl } from '../config';
-import { useDictationSettings } from './useDictationSettings';
-import { DICTATION_PROVIDER_OPENAI, DICTATION_PROVIDER_ELEVENLABS } from './dictationConstants';
-import { safeJsonParse, errorMessage } from '../utils/conversionUtils';
-
-interface UseWhisperOptions {
- onTranscription?: (text: string) => void;
- onError?: (error: Error) => void;
- onSizeWarning?: (sizeInMB: number) => void;
-}
-
-// Constants
-const MAX_AUDIO_SIZE_MB = 25;
-const MAX_RECORDING_DURATION_SECONDS = 600; // 10 minutes
-const WARNING_SIZE_MB = 20; // Warn at 20MB
-
-export const useWhisper = ({ onTranscription, onError, onSizeWarning }: UseWhisperOptions = {}) => {
- const [isRecording, setIsRecording] = useState(false);
- const [isTranscribing, setIsTranscribing] = useState(false);
- const [hasOpenAIKey, setHasOpenAIKey] = useState(false);
- const [canUseDictation, setCanUseDictation] = useState(false);
- const [audioContext, setAudioContext] = useState<AudioContext | null>(null);
- const [analyser, setAnalyser] = useState<AnalyserNode | null>(null);
- const [recordingDuration, setRecordingDuration] = useState(0);
- const [estimatedSize, setEstimatedSize] = useState(0);
-
- const mediaRecorderRef = useRef<MediaRecorder | null>(null);
- const audioChunksRef = useRef<Blob[]>([]);
- const streamRef = useRef<MediaStream | null>(null);
- const recordingStartTimeRef = useRef<number | null>(null);
- const durationIntervalRef = useRef<ReturnType<typeof setInterval> | null>(null);
- const currentSizeRef = useRef(0);
-
- const { getProviders } = useConfig();
- const { settings: dictationSettings, hasElevenLabsKey } = useDictationSettings();
-
- // Check if OpenAI API key is configured (regardless of current provider)
- useEffect(() => {
- const checkOpenAIKey = async () => {
- try {
- // Get all configured providers
- const providers = await getProviders(false);
-
- // Find OpenAI provider
- const openAIProvider = providers.find((p) => p.name === 'openai');
-
- // Check if OpenAI is configured
- if (openAIProvider && openAIProvider.is_configured) {
- setHasOpenAIKey(true);
- } else {
- setHasOpenAIKey(false);
- }
- } catch (error) {
- console.error('Error checking OpenAI configuration:', error);
- setHasOpenAIKey(false);
- }
- };
-
- checkOpenAIKey();
- }, [getProviders]); // Re-check when providers change
-
- // Check if dictation can be used based on settings
- useEffect(() => {
- if (!dictationSettings) {
- setCanUseDictation(false);
- return;
- }
-
- if (!dictationSettings.enabled) {
- setCanUseDictation(false);
- return;
- }
-
- // Check provider availability
- switch (dictationSettings.provider) {
- case DICTATION_PROVIDER_OPENAI:
- setCanUseDictation(hasOpenAIKey);
- break;
- case DICTATION_PROVIDER_ELEVENLABS:
- setCanUseDictation(hasElevenLabsKey);
- break;
- default:
- setCanUseDictation(false);
- }
- }, [dictationSettings, hasOpenAIKey, hasElevenLabsKey]);
-
- // Define stopRecording before startRecording to avoid circular dependency
- const stopRecording = useCallback(() => {
- setIsRecording(false);
-
- if (mediaRecorderRef.current && mediaRecorderRef.current.state !== 'inactive') {
- mediaRecorderRef.current.stop();
- }
-
- // Clear interval
- if (durationIntervalRef.current) {
- clearInterval(durationIntervalRef.current);
- durationIntervalRef.current = null;
- }
-
- // Stop all tracks in the stream
- if (streamRef.current) {
- streamRef.current.getTracks().forEach((track) => track.stop());
- streamRef.current = null;
- }
-
- // Close audio context
- if (audioContext && audioContext.state !== 'closed') {
- audioContext.close().catch(console.error);
- setAudioContext(null);
- setAnalyser(null);
- }
- }, [audioContext]);
-
- // Cleanup effect to prevent memory leaks
- useEffect(() => {
- return () => {
- // Cleanup on unmount
- if (durationIntervalRef.current) {
- clearInterval(durationIntervalRef.current);
- }
- if (streamRef.current) {
- streamRef.current.getTracks().forEach((track) => track.stop());
- }
- if (audioContext && audioContext.state !== 'closed') {
- audioContext.close().catch(console.error);
- }
- };
- }, [audioContext]);
-
- const transcribeAudio = useCallback(
- async (audioBlob: Blob) => {
- if (!dictationSettings) {
- stopRecording();
- onError?.(new Error('Dictation settings not loaded'));
- return;
- }
-
- setIsTranscribing(true);
-
- try {
- // Check final size
- const sizeMB = audioBlob.size / (1024 * 1024);
- if (sizeMB > MAX_AUDIO_SIZE_MB) {
- throw new Error(
- `Audio file too large (${sizeMB.toFixed(1)}MB). Maximum size is ${MAX_AUDIO_SIZE_MB}MB.`
- );
- }
-
- // Convert blob to base64 for easier transport
- const reader = new FileReader();
- const base64Audio = await new Promise<string>((resolve, reject) => {
- reader.onloadend = () => {
- const base64 = reader.result as string;
- resolve(base64.split(',')[1]); // Remove data:audio/webm;base64, prefix
- };
- reader.onerror = reject;
- reader.readAsDataURL(audioBlob);
- });
-
- const mimeType = audioBlob.type;
- if (!mimeType) {
- throw new Error('Unable to determine audio format. Please try again.');
- }
-
- let endpoint = '';
- let headers: Record<string, string> = {
- 'Content-Type': 'application/json',
- 'X-Secret-Key': await window.electron.getSecretKey(),
- };
-
- let body: Record<string, string> = {
- audio: base64Audio,
- mime_type: mimeType,
- };
-
- // Choose endpoint based on provider
- switch (dictationSettings.provider) {
- case DICTATION_PROVIDER_OPENAI:
- endpoint = '/audio/transcribe';
- break;
- case DICTATION_PROVIDER_ELEVENLABS:
- endpoint = '/audio/transcribe/elevenlabs';
- break;
- default:
- throw new Error(`Unsupported provider: ${dictationSettings.provider}`);
- }
-
- const response = await fetch(getApiUrl(endpoint), {
- method: 'POST',
- headers,
- body: JSON.stringify(body),
- });
-
- if (!response.ok) {
- if (response.status === 404) {
- throw new Error(
- `Audio transcription endpoint not found. Please implement ${endpoint} endpoint in the Goose backend.`
- );
- } else if (response.status === 401) {
- throw new Error('Invalid API key. Please check your API key is correct.');
- } else if (response.status === 402) {
- throw new Error('API quota exceeded. Please check your account limits.');
- }
- const errorData = await safeJsonParse<{
- error: { message: string };
- }>(response, 'Failed to parse error response').catch(() => ({
- error: { message: 'Transcription failed' },
- }));
- throw new Error(errorData.error?.message || 'Transcription failed');
- }
-
- const data = await safeJsonParse<{ text: string }>(
- response,
- 'Failed to parse transcription response'
- );
- if (data.text) {
- onTranscription?.(data.text);
- }
- } catch (error) {
- console.error('Error transcribing audio:', error);
- stopRecording();
- onError?.(error as Error);
- } finally {
- setIsTranscribing(false);
- setRecordingDuration(0);
- setEstimatedSize(0);
- }
- },
- [onTranscription, onError, dictationSettings, stopRecording]
- );
-
- const startRecording = useCallback(async () => {
- if (!dictationSettings) {
- stopRecording();
- onError?.(new Error('Dictation settings not loaded'));
- return;
- }
-
- try {
- // Request microphone permission
- const stream = await navigator.mediaDevices.getUserMedia({
- audio: {
- echoCancellation: true,
- noiseSuppression: true,
- autoGainControl: true,
- sampleRate: 44100,
- },
- });
- streamRef.current = stream;
-
- // Verify we have valid audio tracks
- const audioTracks = stream.getAudioTracks();
- if (audioTracks.length === 0) {
- throw new Error('No audio tracks available in the microphone stream');
- }
-
- // AudioContext creation is disabled to prevent MediaRecorder conflicts
- setAudioContext(null);
- setAnalyser(null);
-
- // Determine best supported MIME type
- const supportedTypes = ['audio/webm;codecs=opus', 'audio/webm', 'audio/mp4', 'audio/wav'];
-
- const mimeType = supportedTypes.find((type) => MediaRecorder.isTypeSupported(type)) || '';
-
- const mediaRecorder = new MediaRecorder(stream, mimeType ? { mimeType } : {});
-
- mediaRecorderRef.current = mediaRecorder;
- audioChunksRef.current = [];
- currentSizeRef.current = 0;
- recordingStartTimeRef.current = Date.now();
-
- // Start duration and size tracking
- durationIntervalRef.current = setInterval(() => {
- const elapsed = (Date.now() - (recordingStartTimeRef.current || 0)) / 1000;
- setRecordingDuration(elapsed);
-
- // Estimate size based on typical webm bitrate (~128kbps)
- const estimatedSizeMB = (elapsed * 128 * 1024) / (8 * 1024 * 1024);
- setEstimatedSize(estimatedSizeMB);
-
- // Check if we're approaching the limit
- if (estimatedSizeMB > WARNING_SIZE_MB) {
- onSizeWarning?.(estimatedSizeMB);
- }
-
- // Auto-stop if we hit the duration limit
- if (elapsed >= MAX_RECORDING_DURATION_SECONDS) {
- stopRecording();
- onError?.(
- new Error(
- `Maximum recording duration (${MAX_RECORDING_DURATION_SECONDS / 60} minutes) reached.`
- )
- );
- }
- }, 100);
-
- mediaRecorder.ondataavailable = (event) => {
- if (event.data.size > 0) {
- audioChunksRef.current.push(event.data);
- currentSizeRef.current += event.data.size;
-
- // Check actual size
- const actualSizeMB = currentSizeRef.current / (1024 * 1024);
- if (actualSizeMB > MAX_AUDIO_SIZE_MB) {
- stopRecording();
- onError?.(new Error(`Maximum file size (${MAX_AUDIO_SIZE_MB}MB) reached.`));
- }
- }
- };
-
- mediaRecorder.onstop = async () => {
- const audioBlob = new Blob(audioChunksRef.current, { type: mimeType || 'audio/webm' });
-
- // Check if the blob is empty
- if (audioBlob.size === 0) {
- onError?.(
- new Error(
- 'No audio data was recorded. Please check your microphone permissions and try again.'
- )
- );
- return;
- }
-
- await transcribeAudio(audioBlob);
- };
-
- // Add error handler for MediaRecorder
- mediaRecorder.onerror = (event) => {
- console.error('MediaRecorder error:', event);
- onError?.(new Error('Recording failed: Unknown error'));
- };
-
- if (!stream.active) {
- throw new Error('Audio stream became inactive before recording could start');
- }
-
- // Check audio tracks again before starting recording
- if (audioTracks.length === 0) {
- throw new Error('No audio tracks available in the stream');
- }
-
- const activeAudioTracks = audioTracks.filter((track) => track.readyState === 'live');
- if (activeAudioTracks.length === 0) {
- throw new Error('No live audio tracks available');
- }
-
- try {
- mediaRecorder.start(100);
- setIsRecording(true);
- } catch (startError) {
- console.error('Error calling mediaRecorder.start():', startError);
- throw new Error(`Failed to start recording: ${errorMessage(startError)}`);
- }
- } catch (error) {
- console.error('Error starting recording:', error);
- stopRecording();
- onError?.(error as Error);
- }
- }, [onError, onSizeWarning, transcribeAudio, stopRecording, dictationSettings]);
-
- return {
- isRecording,
- isTranscribing,
- hasOpenAIKey,
- canUseDictation,
- audioContext,
- analyser,
- startRecording,
- stopRecording,
- recordingDuration,
- estimatedSize,
- dictationSettings,
- };
-};
diff --git a/ui/desktop/src/updates.ts b/ui/desktop/src/updates.ts
index 756e73682c3e..14e49fc10a07 100644
--- a/ui/desktop/src/updates.ts
+++ b/ui/desktop/src/updates.ts
@@ -2,5 +2,4 @@ export const UPDATES_ENABLED = true;
export const COST_TRACKING_ENABLED = true;
export const ANNOUNCEMENTS_ENABLED = false;
export const CONFIGURATION_ENABLED = true;
-export const VOICE_DICTATION_ELEVENLABS_ENABLED = true;
export const TELEMETRY_UI_ENABLED = true;
From 1a3afddc99c2fac86488b7dbd22c68e924feeeea Mon Sep 17 00:00:00 2001
From: Douwe Osinga
Date: Fri, 30 Jan 2026 12:33:54 -0500
Subject: [PATCH 2/4] Manual clean up
---
.../settings/dictation/DictationSettings.tsx | 34 ++++---------
ui/desktop/src/hooks/useAudioRecorder.ts | 51 +++++--------------
2 files changed, 24 insertions(+), 61 deletions(-)
diff --git a/ui/desktop/src/components/settings/dictation/DictationSettings.tsx b/ui/desktop/src/components/settings/dictation/DictationSettings.tsx
index 482fcb31f5c7..39774cca20da 100644
--- a/ui/desktop/src/components/settings/dictation/DictationSettings.tsx
+++ b/ui/desktop/src/components/settings/dictation/DictationSettings.tsx
@@ -19,18 +19,12 @@ export const DictationSettings = () => {
useEffect(() => {
const loadSettings = async () => {
- try {
- const providerValue = await read('voice_dictation_provider', false);
- const loadedProvider: DictationProvider | null =
- (providerValue as DictationProvider) || null;
- setProvider(loadedProvider);
-
- // Get provider configuration status from backend
- const audioConfig = await getDictationConfig();
- setProviderStatuses(audioConfig.data || {});
- } catch (error) {
- console.error('Error loading dictation settings:', error);
- }
+ const providerValue = await read('voice_dictation_provider', false);
+ const loadedProvider: DictationProvider | null = (providerValue as DictationProvider) || null;
+ setProvider(loadedProvider);
+
+ const audioConfig = await getDictationConfig();
+ setProviderStatuses(audioConfig.data || {});
};
loadSettings();
@@ -39,12 +33,8 @@ export const DictationSettings = () => {
const saveProvider = async (newProvider: DictationProvider | null) => {
console.log('Saving dictation provider to backend config:', newProvider);
setProvider(newProvider);
- try {
- await upsert('voice_dictation_provider', newProvider || '', false);
- trackSettingToggled('voice_dictation', newProvider !== null);
- } catch (error) {
- console.error('Error saving dictation provider:', error);
- }
+ await upsert('voice_dictation_provider', newProvider || '', false);
+ trackSettingToggled('voice_dictation', newProvider !== null);
};
const handleProviderChange = (newProvider: DictationProvider | null) => {
@@ -57,12 +47,8 @@ export const DictationSettings = () => {
setShowProviderDropdown(newShowState);
if (newShowState) {
- try {
- const audioConfig = await getDictationConfig();
- setProviderStatuses(audioConfig.data || {});
- } catch (error) {
- console.error('Error checking provider configuration:', error);
- }
+ const audioConfig = await getDictationConfig();
+ setProviderStatuses(audioConfig.data || {});
}
};
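
With the try/catch removed, a failed read or getDictationConfig now rejects inside the effect. If a guard were ever wanted again, the lighter form would be at the call site (hypothetical, not in the patch):

loadSettings().catch((error) => console.error('Error loading dictation settings:', error));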
diff --git a/ui/desktop/src/hooks/useAudioRecorder.ts b/ui/desktop/src/hooks/useAudioRecorder.ts
index 151bcdf74c1b..4411c4ca892a 100644
--- a/ui/desktop/src/hooks/useAudioRecorder.ts
+++ b/ui/desktop/src/hooks/useAudioRecorder.ts
@@ -3,14 +3,14 @@ import { transcribeDictation, getDictationConfig, DictationProvider } from '../a
import { useConfig } from '../components/ConfigContext';
interface UseAudioRecorderOptions {
- onTranscription?: (text: string) => void;
- onError?: (error: Error) => void;
+ onTranscription: (text: string) => void;
+ onError: (error: Error) => void;
}
const MAX_AUDIO_SIZE_MB = 25;
const MAX_RECORDING_DURATION_SECONDS = 600; // 10 minutes
-export const useAudioRecorder = ({ onTranscription, onError }: UseAudioRecorderOptions = {}) => {
+export const useAudioRecorder = ({ onTranscription, onError }: UseAudioRecorderOptions) => {
const [isRecording, setIsRecording] = useState(false);
const [isTranscribing, setIsTranscribing] = useState(false);
const [recordingDuration, setRecordingDuration] = useState(0);
@@ -25,42 +25,23 @@ export const useAudioRecorder = ({ onTranscription, onError }: UseAudioRecorderO
const streamRef = useRef<MediaStream | null>(null);
const durationIntervalRef = useRef<ReturnType<typeof setInterval> | null>(null);
- // Check provider configuration on mount
useEffect(() => {
const checkProviderConfig = async () => {
try {
- // Read provider preference from backend config
const providerValue = await read('voice_dictation_provider', false);
const preferredProvider = (providerValue as DictationProvider) || null;
- console.log('[useAudioRecorder] Read voice_dictation_provider:', preferredProvider);
- // If no provider selected, dictation is disabled
if (!preferredProvider) {
- console.log('[useAudioRecorder] No provider selected, setting to null');
setIsEnabled(false);
setProvider(null);
return;
}
- // Check backend audio config to see if provider is actually configured (has API key)
const audioConfigResponse = await getDictationConfig();
const providerStatus = audioConfigResponse.data?.[preferredProvider];
- console.log(
- '[useAudioRecorder] Provider status for',
- preferredProvider,
- ':',
- providerStatus
- );
-
- if (providerStatus?.configured) {
- console.log('[useAudioRecorder] Provider is configured, enabling');
- setIsEnabled(true);
- setProvider(preferredProvider);
- } else {
- console.log('[useAudioRecorder] Provider not configured, disabling but keeping provider');
- setIsEnabled(false);
- setProvider(preferredProvider);
- }
+
+ setIsEnabled(!!providerStatus?.configured);
+ setProvider(preferredProvider);
} catch (error) {
console.error('Error checking audio config:', error);
setIsEnabled(false);
@@ -89,7 +70,6 @@ export const useAudioRecorder = ({ onTranscription, onError }: UseAudioRecorderO
}
}, []);
- // Cleanup on unmount
useEffect(() => {
return () => {
if (durationIntervalRef.current) {
@@ -104,14 +84,13 @@ export const useAudioRecorder = ({ onTranscription, onError }: UseAudioRecorderO
const transcribeAudio = useCallback(
async (audioBlob: Blob) => {
if (!provider) {
- onError?.(new Error('No transcription provider configured'));
+ onError(new Error('No transcription provider configured'));
return;
}
setIsTranscribing(true);
try {
- // Check file size
const sizeMB = audioBlob.size / (1024 * 1024);
if (sizeMB > MAX_AUDIO_SIZE_MB) {
throw new Error(
@@ -119,7 +98,6 @@ export const useAudioRecorder = ({ onTranscription, onError }: UseAudioRecorderO
);
}
- // Convert to base64
const reader = new FileReader();
const base64Audio = await new Promise<string>((resolve, reject) => {
reader.onloadend = () => {
@@ -135,7 +113,6 @@ export const useAudioRecorder = ({ onTranscription, onError }: UseAudioRecorderO
throw new Error('Unable to determine audio format');
}
- // Transcribe using generated API
const result = await transcribeDictation({
body: {
audio: base64Audio,
@@ -146,11 +123,11 @@ export const useAudioRecorder = ({ onTranscription, onError }: UseAudioRecorderO
});
if (result.data?.text) {
- onTranscription?.(result.data.text);
+ onTranscription(result.data.text);
}
} catch (error) {
console.error('Error transcribing audio:', error);
- onError?.(error as Error);
+ onError(error as Error);
} finally {
setIsTranscribing(false);
setRecordingDuration(0);
@@ -162,7 +139,7 @@ export const useAudioRecorder = ({ onTranscription, onError }: UseAudioRecorderO
const startRecording = useCallback(async () => {
if (!isEnabled) {
- onError?.(new Error('Voice dictation is not enabled'));
+ onError(new Error('Voice dictation is not enabled'));
return;
}
@@ -197,7 +174,7 @@ export const useAudioRecorder = ({ onTranscription, onError }: UseAudioRecorderO
// Auto-stop at max duration
if (elapsed >= MAX_RECORDING_DURATION_SECONDS) {
stopRecording();
- onError?.(
+ onError(
new Error(
`Maximum recording duration (${MAX_RECORDING_DURATION_SECONDS / 60} minutes) reached`
)
@@ -215,7 +192,7 @@ export const useAudioRecorder = ({ onTranscription, onError }: UseAudioRecorderO
const audioBlob = new Blob(audioChunksRef.current, { type: mimeType || 'audio/webm' });
if (audioBlob.size === 0) {
- onError?.(new Error('No audio data was recorded. Please check your microphone.'));
+ onError(new Error('No audio data was recorded. Please check your microphone.'));
return;
}
@@ -224,7 +201,7 @@ export const useAudioRecorder = ({ onTranscription, onError }: UseAudioRecorderO
mediaRecorder.onerror = (event) => {
console.error('MediaRecorder error:', event);
- onError?.(new Error('Recording failed'));
+ onError(new Error('Recording failed'));
};
mediaRecorder.start(100);
@@ -232,7 +209,7 @@ export const useAudioRecorder = ({ onTranscription, onError }: UseAudioRecorderO
} catch (error) {
console.error('Error starting recording:', error);
stopRecording();
- onError?.(error as Error);
+ onError(error as Error);
}
}, [isEnabled, onError, transcribeAudio, stopRecording]);
From 4865f8b0a054b480ba7e4b3d535a51af0a3309d2 Mon Sep 17 00:00:00 2001
From: Douwe Osinga
Date: Fri, 30 Jan 2026 13:54:26 -0500
Subject: [PATCH 3/4] Simplify further
---
crates/goose-server/src/routes/dictation.rs | 186 ++++++++------------
crates/goose/src/providers/api_client.rs | 5 +
ui/desktop/src/components/ChatInput.tsx | 11 +-
ui/desktop/src/hooks/useAudioRecorder.ts | 33 ++--
4 files changed, 95 insertions(+), 140 deletions(-)
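Notes: the server now reuses goose's shared ApiClient (bearer token for OpenAI,
an xi-api-key header for ElevenLabs) through a new multipart_post helper instead
of building a one-off reqwest::Client per request, with common status mapping
factored into handle_response_error. On the UI side, onError narrows from
(error: Error) to (message: string), normalized via errorMessage(). A sketch of
the new callback shape; the toast wiring is illustrative:

    // onError now receives a plain string, so callers no longer
    // unwrap Error objects themselves.
    useAudioRecorder({
      onTranscription: (text) => setValue(text),
      onError: (message) => toastError({ title: 'Dictation Error', msg: message }),
    });
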
diff --git a/crates/goose-server/src/routes/dictation.rs b/crates/goose-server/src/routes/dictation.rs
index d9658c7f01a4..66f53cd28c25 100644
--- a/crates/goose-server/src/routes/dictation.rs
+++ b/crates/goose-server/src/routes/dictation.rs
@@ -6,7 +6,7 @@ use axum::{
Json, Router,
};
use base64::{engine::general_purpose::STANDARD as BASE64, Engine};
-use reqwest::Client;
+use goose::providers::api_client::{ApiClient, AuthMethod};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::sync::Arc;
@@ -141,54 +141,68 @@ fn validate_audio(audio: &str, mime_type: &str) -> Result<(Vec<u8>, &'static str
Ok((audio_bytes, extension))
}
-fn get_provider_config(provider: &str) -> Result<(String, String), ErrorResponse> {
+async fn handle_response_error(response: reqwest::Response) -> ErrorResponse {
+ let status = response.status();
+ let error_text = response.text().await.unwrap_or_default();
+
+ ErrorResponse {
+ message: if status == 401 || error_text.contains("Invalid API key") || error_text.contains("Unauthorized") {
+ "Invalid API key".to_string()
+ } else if status == 429 || error_text.contains("quota") || error_text.contains("limit") {
+ "Rate limit exceeded".to_string()
+ } else {
+ format!("API error: {}", error_text)
+ },
+ status: if status.is_client_error() { status } else { StatusCode::BAD_GATEWAY },
+ }
+}
+
+fn build_api_client(provider: &str) -> Result<ApiClient, ErrorResponse> {
let config = goose::config::Config::global();
let def = get_provider_def(provider)
.ok_or_else(|| ErrorResponse::bad_request(format!("Unknown provider: {}", provider)))?;
- let api_key = config
- .get_secret(def.config_key)
- .map_err(|_| ErrorResponse {
- message: format!("{} not configured", def.config_key),
- status: StatusCode::PRECONDITION_FAILED,
- })?;
+ let api_key = config.get_secret(def.config_key).map_err(|_| ErrorResponse {
+ message: format!("{} not configured", def.config_key),
+ status: StatusCode::PRECONDITION_FAILED,
+ })?;
let url = if let Some(host_key) = def.host_key {
- // If host_key is configured, replace the host part of the default URL
- if let Some(custom_host) = config
+ config
.get(host_key, false)
.ok()
.and_then(|v| v.as_str().map(|s| s.to_string()))
- {
- // Extract the path from default_url (everything after the third slash)
- // e.g., "https://api.openai.com/v1/audio/transcriptions" -> "/v1/audio/transcriptions"
- let path = def
- .default_url
- .splitn(4, '/')
- .nth(3)
- .map(|p| format!("/{}", p))
- .unwrap_or_else(|| "".to_string());
-
- // Remove trailing slash from custom host if present
- let custom_host = custom_host.trim_end_matches('/');
-
- format!("{}{}", custom_host, path)
- } else {
- def.default_url.to_string()
- }
+ .map(|custom_host| {
+ let path = def.default_url
+ .splitn(4, '/')
+ .nth(3)
+ .map(|p| format!("/{}", p))
+ .unwrap_or_default();
+ format!("{}{}", custom_host.trim_end_matches('/'), path)
+ })
+ .unwrap_or_else(|| def.default_url.to_string())
} else {
def.default_url.to_string()
};
- Ok((api_key, url))
+ let auth = match provider {
+ "openai" => AuthMethod::BearerToken(api_key),
+ "elevenlabs" => AuthMethod::ApiKey {
+ header_name: "xi-api-key".to_string(),
+ key: api_key,
+ },
+ _ => return Err(ErrorResponse::bad_request(format!("Unknown provider: {}", provider))),
+ };
+
+ ApiClient::with_timeout(url, auth, REQUEST_TIMEOUT)
+ .map_err(|e| ErrorResponse::internal(format!("Failed to create client: {}", e)))
}
async fn transcribe_openai(
audio_bytes: Vec<u8>,
extension: &str,
mime_type: &str,
- api_key: &str,
- url: &str,
+ client: &ApiClient,
) -> Result<String, ErrorResponse> {
let part = reqwest::multipart::Part::bytes(audio_bytes)
.file_name(format!("audio.{}", extension))
@@ -199,49 +213,25 @@ async fn transcribe_openai(
.part("file", part)
.text("model", "whisper-1");
- let client = Client::builder()
- .timeout(REQUEST_TIMEOUT)
- .build()
- .map_err(|e| ErrorResponse::internal(format!("Failed to create client: {}", e)))?;
-
let response = client
- .post(url)
- .header("Authorization", format!("Bearer {}", api_key))
- .multipart(form)
- .send()
+ .request(None, "")
+ .multipart_post(form)
.await
- .map_err(|e| {
- if e.is_timeout() {
- ErrorResponse {
- message: "Request timed out".to_string(),
- status: StatusCode::GATEWAY_TIMEOUT,
- }
+ .map_err(|e| ErrorResponse {
+ message: if e.to_string().contains("timeout") {
+ "Request timed out".to_string()
} else {
- ErrorResponse {
- message: format!("Request failed: {}", e),
- status: StatusCode::SERVICE_UNAVAILABLE,
- }
- }
- })?;
-
- if !response.status().is_success() {
- let status = response.status();
- let error_text = response.text().await.unwrap_or_default();
-
- return Err(ErrorResponse {
- message: if status == 401 {
- "Invalid API key".to_string()
- } else if status == 429 {
- "Rate limit exceeded".to_string()
- } else {
- format!("API error: {}", error_text)
+ format!("Request failed: {}", e)
},
- status: if status.is_client_error() {
- status
+ status: if e.to_string().contains("timeout") {
+ StatusCode::GATEWAY_TIMEOUT
} else {
- StatusCode::BAD_GATEWAY
+ StatusCode::SERVICE_UNAVAILABLE
},
- });
+ })?;
+
+ if !response.status().is_success() {
+ return Err(handle_response_error(response).await);
}
let data: TranscribeResponse = response
@@ -256,8 +246,7 @@ async fn transcribe_elevenlabs(
audio_bytes: Vec<u8>,
extension: &str,
mime_type: &str,
- api_key: &str,
- url: &str,
+ client: &ApiClient,
) -> Result<String, ErrorResponse> {
let part = reqwest::multipart::Part::bytes(audio_bytes)
.file_name(format!("audio.{}", extension))
@@ -268,51 +257,25 @@ async fn transcribe_elevenlabs(
.part("file", part)
.text("model_id", "scribe_v1");
- let client = Client::builder()
- .timeout(REQUEST_TIMEOUT)
- .build()
- .map_err(|e| ErrorResponse::internal(format!("Failed to create client: {}", e)))?;
-
let response = client
- .post(url)
- .header("xi-api-key", api_key)
- .multipart(form)
- .send()
+ .request(None, "")
+ .multipart_post(form)
.await
- .map_err(|e| {
- if e.is_timeout() {
- ErrorResponse {
- message: "Request timed out".to_string(),
- status: StatusCode::GATEWAY_TIMEOUT,
- }
+ .map_err(|e| ErrorResponse {
+ message: if e.to_string().contains("timeout") {
+ "Request timed out".to_string()
} else {
- ErrorResponse {
- message: format!("Request failed: {}", e),
- status: StatusCode::SERVICE_UNAVAILABLE,
- }
- }
- })?;
-
- if !response.status().is_success() {
- let status = response.status();
- let error_text = response.text().await.unwrap_or_default();
-
- return Err(ErrorResponse {
- message: if error_text.contains("Unauthorized")
- || error_text.contains("Invalid API key")
- {
- "Invalid API key".to_string()
- } else if error_text.contains("quota") || error_text.contains("limit") {
- "Rate limit exceeded".to_string()
- } else {
- format!("API error: {}", error_text)
+ format!("Request failed: {}", e)
},
- status: if status.is_client_error() {
- status
+ status: if e.to_string().contains("timeout") {
+ StatusCode::GATEWAY_TIMEOUT
} else {
- StatusCode::BAD_GATEWAY
+ StatusCode::SERVICE_UNAVAILABLE
},
- });
+ })?;
+
+ if !response.status().is_success() {
+ return Err(handle_response_error(response).await);
}
let data: TranscribeResponse = response
@@ -345,15 +308,14 @@ pub async fn transcribe_dictation(
) -> Result<Json<TranscribeResponse>, ErrorResponse> {
let (audio_bytes, extension) = validate_audio(&request.audio, &request.mime_type)?;
let provider_name = request.provider.as_str();
- let (api_key, url) = get_provider_config(provider_name)?;
+ let client = build_api_client(provider_name)?;
let text = match request.provider {
DictationProvider::OpenAI => {
- transcribe_openai(audio_bytes, extension, &request.mime_type, &api_key, &url).await?
+ transcribe_openai(audio_bytes, extension, &request.mime_type, &client).await?
}
DictationProvider::ElevenLabs => {
- transcribe_elevenlabs(audio_bytes, extension, &request.mime_type, &api_key, &url)
- .await?
+ transcribe_elevenlabs(audio_bytes, extension, &request.mime_type, &client).await?
}
};
diff --git a/crates/goose/src/providers/api_client.rs b/crates/goose/src/providers/api_client.rs
index 627f5435d535..4524f1a9ee36 100644
--- a/crates/goose/src/providers/api_client.rs
+++ b/crates/goose/src/providers/api_client.rs
@@ -365,6 +365,11 @@ impl<'a> ApiRequestBuilder<'a> {
Ok(request.json(payload).send().await?)
}
+ pub async fn multipart_post(self, form: reqwest::multipart::Form) -> Result<reqwest::Response> {
+ let request = self.send_request(|url, client| client.post(url)).await?;
+ Ok(request.multipart(form).send().await?)
+ }
+
pub async fn api_get(self) -> Result<ApiResponse> {
let response = self.response_get().await?;
ApiResponse::from_response(response).await
diff --git a/ui/desktop/src/components/ChatInput.tsx b/ui/desktop/src/components/ChatInput.tsx
index 4c8e2e99c68d..17bded35f421 100644
--- a/ui/desktop/src/components/ChatInput.tsx
+++ b/ui/desktop/src/components/ChatInput.tsx
@@ -271,12 +271,12 @@ export default function ChatInput({
setValue(newValue);
textAreaRef.current?.focus();
},
- onError: (error) => {
- const errorType = error.name || 'DictationError';
+ onError: (message) => {
+ const errorType = 'DictationError';
trackVoiceDictation('error', undefined, errorType);
toastError({
title: 'Dictation Error',
- msg: error.message,
+ msg: message,
});
},
});
@@ -1272,11 +1272,6 @@ export default function ChatInput({
ElevenLabs API key is not configured. Set it up in Settings {'>'}{' '}
Chat {'>'} Voice Dictation.
- ) : dictationProvider === null ? (
-
- Dictation is not configured. Configure it in Settings {'>'}{' '}
- Chat {'>'} Voice Dictation.
-
) : (
Dictation provider is not properly configured.
)}
diff --git a/ui/desktop/src/hooks/useAudioRecorder.ts b/ui/desktop/src/hooks/useAudioRecorder.ts
index 4411c4ca892a..cfdc43f20678 100644
--- a/ui/desktop/src/hooks/useAudioRecorder.ts
+++ b/ui/desktop/src/hooks/useAudioRecorder.ts
@@ -1,14 +1,15 @@
import { useState, useRef, useCallback, useEffect } from 'react';
import { transcribeDictation, getDictationConfig, DictationProvider } from '../api';
import { useConfig } from '../components/ConfigContext';
+import { errorMessage } from '../utils/conversionUtils';
interface UseAudioRecorderOptions {
onTranscription: (text: string) => void;
- onError: (error: Error) => void;
+ onError: (message: string) => void;
}
const MAX_AUDIO_SIZE_MB = 25;
-const MAX_RECORDING_DURATION_SECONDS = 600; // 10 minutes
+const MAX_RECORDING_DURATION_SECONDS = 10 * 60;
export const useAudioRecorder = ({ onTranscription, onError }: UseAudioRecorderOptions) => {
const [isRecording, setIsRecording] = useState(false);
@@ -84,7 +85,7 @@ export const useAudioRecorder = ({ onTranscription, onError }: UseAudioRecorderO
const transcribeAudio = useCallback(
async (audioBlob: Blob) => {
if (!provider) {
- onError(new Error('No transcription provider configured'));
+ onError('No transcription provider configured');
return;
}
@@ -93,9 +94,10 @@ export const useAudioRecorder = ({ onTranscription, onError }: UseAudioRecorderO
try {
const sizeMB = audioBlob.size / (1024 * 1024);
if (sizeMB > MAX_AUDIO_SIZE_MB) {
- throw new Error(
+ onError(
`Audio file too large (${sizeMB.toFixed(1)}MB). Maximum size is ${MAX_AUDIO_SIZE_MB}MB.`
);
+ return;
}
const reader = new FileReader();
@@ -126,8 +128,7 @@ export const useAudioRecorder = ({ onTranscription, onError }: UseAudioRecorderO
onTranscription(result.data.text);
}
} catch (error) {
- console.error('Error transcribing audio:', error);
- onError(error as Error);
+ onError(errorMessage(error));
} finally {
setIsTranscribing(false);
setRecordingDuration(0);
@@ -139,7 +140,7 @@ export const useAudioRecorder = ({ onTranscription, onError }: UseAudioRecorderO
const startRecording = useCallback(async () => {
if (!isEnabled) {
- onError(new Error('Voice dictation is not enabled'));
+ onError('Voice dictation is not enabled');
return;
}
@@ -153,7 +154,6 @@ export const useAudioRecorder = ({ onTranscription, onError }: UseAudioRecorderO
});
streamRef.current = stream;
- // Determine best supported MIME type
const supportedTypes = ['audio/webm;codecs=opus', 'audio/webm', 'audio/mp4', 'audio/wav'];
const mimeType = supportedTypes.find((type) => MediaRecorder.isTypeSupported(type)) || '';
@@ -161,23 +161,18 @@ export const useAudioRecorder = ({ onTranscription, onError }: UseAudioRecorderO
mediaRecorderRef.current = mediaRecorder;
audioChunksRef.current = [];
- // Track recording duration and size
const startTime = Date.now();
durationIntervalRef.current = setInterval(() => {
const elapsed = (Date.now() - startTime) / 1000;
setRecordingDuration(elapsed);
- // Estimate size based on typical webm bitrate (~128kbps)
const estimatedSizeMB = (elapsed * 128 * 1024) / (8 * 1024 * 1024);
setEstimatedSize(estimatedSizeMB);
- // Auto-stop at max duration
if (elapsed >= MAX_RECORDING_DURATION_SECONDS) {
stopRecording();
onError(
- new Error(
- `Maximum recording duration (${MAX_RECORDING_DURATION_SECONDS / 60} minutes) reached`
- )
+ `Maximum recording duration (${MAX_RECORDING_DURATION_SECONDS / 60} minutes) reached`
);
}
}, 100);
@@ -192,24 +187,22 @@ export const useAudioRecorder = ({ onTranscription, onError }: UseAudioRecorderO
const audioBlob = new Blob(audioChunksRef.current, { type: mimeType || 'audio/webm' });
if (audioBlob.size === 0) {
- onError(new Error('No audio data was recorded. Please check your microphone.'));
+ onError('No audio data was recorded. Please check your microphone.');
return;
}
await transcribeAudio(audioBlob);
};
- mediaRecorder.onerror = (event) => {
- console.error('MediaRecorder error:', event);
- onError(new Error('Recording failed'));
+ mediaRecorder.onerror = (_event) => {
+ onError('Recording failed');
};
mediaRecorder.start(100);
setIsRecording(true);
} catch (error) {
- console.error('Error starting recording:', error);
stopRecording();
- onError(error as Error);
+ onError(errorMessage(error));
}
}, [isEnabled, onError, transcribeAudio, stopRecording]);
From 66d07240cde3e87d56af995d08702e3b0acdc790 Mon Sep 17 00:00:00 2001
From: Douwe Osinga
Date: Fri, 30 Jan 2026 13:57:02 -0500
Subject: [PATCH 4/4] Format
---
crates/goose-server/src/routes/dictation.rs | 31 +++++++++++++++------
1 file changed, 23 insertions(+), 8 deletions(-)
diff --git a/crates/goose-server/src/routes/dictation.rs b/crates/goose-server/src/routes/dictation.rs
index 66f53cd28c25..5112fd0d044f 100644
--- a/crates/goose-server/src/routes/dictation.rs
+++ b/crates/goose-server/src/routes/dictation.rs
@@ -146,14 +146,21 @@ async fn handle_response_error(response: reqwest::Response) -> ErrorResponse {
let error_text = response.text().await.unwrap_or_default();
ErrorResponse {
- message: if status == 401 || error_text.contains("Invalid API key") || error_text.contains("Unauthorized") {
+ message: if status == 401
+ || error_text.contains("Invalid API key")
+ || error_text.contains("Unauthorized")
+ {
"Invalid API key".to_string()
} else if status == 429 || error_text.contains("quota") || error_text.contains("limit") {
"Rate limit exceeded".to_string()
} else {
format!("API error: {}", error_text)
},
- status: if status.is_client_error() { status } else { StatusCode::BAD_GATEWAY },
+ status: if status.is_client_error() {
+ status
+ } else {
+ StatusCode::BAD_GATEWAY
+ },
}
}
@@ -162,10 +169,12 @@ fn build_api_client(provider: &str) -> Result<ApiClient, ErrorResponse> {
let def = get_provider_def(provider)
.ok_or_else(|| ErrorResponse::bad_request(format!("Unknown provider: {}", provider)))?;
- let api_key = config.get_secret(def.config_key).map_err(|_| ErrorResponse {
- message: format!("{} not configured", def.config_key),
- status: StatusCode::PRECONDITION_FAILED,
- })?;
+ let api_key = config
+ .get_secret(def.config_key)
+ .map_err(|_| ErrorResponse {
+ message: format!("{} not configured", def.config_key),
+ status: StatusCode::PRECONDITION_FAILED,
+ })?;
let url = if let Some(host_key) = def.host_key {
config
@@ -173,7 +182,8 @@ fn build_api_client(provider: &str) -> Result<ApiClient, ErrorResponse> {
.ok()
.and_then(|v| v.as_str().map(|s| s.to_string()))
.map(|custom_host| {
- let path = def.default_url
+ let path = def
+ .default_url
.splitn(4, '/')
.nth(3)
.map(|p| format!("/{}", p))
@@ -191,7 +201,12 @@ fn build_api_client(provider: &str) -> Result<ApiClient, ErrorResponse> {
header_name: "xi-api-key".to_string(),
key: api_key,
},
- _ => return Err(ErrorResponse::bad_request(format!("Unknown provider: {}", provider))),
+ _ => {
+ return Err(ErrorResponse::bad_request(format!(
+ "Unknown provider: {}",
+ provider
+ )))
+ }
};
ApiClient::with_timeout(url, auth, REQUEST_TIMEOUT)