diff --git a/crates/goose-server/src/openapi.rs b/crates/goose-server/src/openapi.rs
index 78158fd2e0c4..fa9f8a09564d 100644
--- a/crates/goose-server/src/openapi.rs
+++ b/crates/goose-server/src/openapi.rs
@@ -412,6 +412,8 @@ derive_utoipa!(Icon as IconSchema);
         super::routes::tunnel::stop_tunnel,
         super::routes::tunnel::get_tunnel_status,
         super::routes::telemetry::send_telemetry_event,
+        super::routes::dictation::transcribe_dictation,
+        super::routes::dictation::get_dictation_config,
     ),
     components(schemas(
         super::routes::config_management::UpsertConfigQuery,
@@ -570,6 +572,10 @@ derive_utoipa!(Icon as IconSchema);
         goose::goose_apps::CspMetadata,
         goose::goose_apps::UiMetadata,
         goose::goose_apps::ResourceMetadata,
+        super::routes::dictation::TranscribeRequest,
+        super::routes::dictation::TranscribeResponse,
+        super::routes::dictation::DictationProvider,
+        super::routes::dictation::DictationProviderStatus,
     ))
 )]
 pub struct ApiDoc;
diff --git a/crates/goose-server/src/routes/audio.rs b/crates/goose-server/src/routes/audio.rs
deleted file mode 100644
index c0364ef3d768..000000000000
--- a/crates/goose-server/src/routes/audio.rs
+++ /dev/null
@@ -1,512 +0,0 @@
-/// Audio transcription route handler
-///
-/// This module provides endpoints for audio transcription using OpenAI's Whisper API.
-/// The OpenAI API key must be configured in the backend for this to work.
-use crate::routes::errors::ErrorResponse;
-use crate::state::AppState;
-use axum::{
-    http::StatusCode,
-    routing::{get, post},
-    Json, Router,
-};
-use base64::{engine::general_purpose::STANDARD as BASE64, Engine};
-use reqwest::Client;
-use serde::{Deserialize, Serialize};
-use std::sync::Arc;
-use std::time::Duration;
-
-// Constants
-const MAX_AUDIO_SIZE_BYTES: usize = 25 * 1024 * 1024; // 25MB
-const OPENAI_TIMEOUT_SECONDS: u64 = 30;
-
-#[derive(Debug, Deserialize)]
-struct TranscribeRequest {
-    audio: String, // Base64 encoded audio data
-    mime_type: String,
-}
-
-#[derive(Debug, Deserialize)]
-struct TranscribeElevenLabsRequest {
-    audio: String, // Base64 encoded audio data
-    mime_type: String,
-}
-
-#[derive(Debug, Serialize)]
-struct TranscribeResponse {
-    text: String,
-}
-
-#[derive(Debug, Deserialize)]
-struct WhisperResponse {
-    text: String,
-}
-
-/// Validate audio input and return decoded bytes and file extension
-fn validate_audio_input(
-    audio: &str,
-    mime_type: &str,
-) -> Result<(Vec<u8>, &'static str), ErrorResponse> {
-    // Decode the base64 audio data
-    let audio_bytes = BASE64
-        .decode(audio)
-        .map_err(|_| ErrorResponse::bad_request("Invalid base64 audio data"))?;
-
-    // Check file size
-    if audio_bytes.len() > MAX_AUDIO_SIZE_BYTES {
-        return Err(ErrorResponse {
-            message: format!(
-                "Audio file too large: {} bytes (max: {} bytes)",
-                audio_bytes.len(),
-                MAX_AUDIO_SIZE_BYTES
-            ),
-            status: StatusCode::PAYLOAD_TOO_LARGE,
-        });
-    }
-
-    // Determine file extension based on MIME type
-    let file_extension = match mime_type {
-        "audio/webm" => "webm",
-        "audio/webm;codecs=opus" => "webm",
-        "audio/mp4" => "mp4",
-        "audio/mpeg" => "mp3",
-        "audio/mpga" => "mpga",
-        "audio/m4a" => "m4a",
-        "audio/wav" => "wav",
-        "audio/x-wav" => "wav",
-        _ => {
-            return Err(ErrorResponse {
-                message: format!("Unsupported audio format: {}", mime_type),
-                status: StatusCode::UNSUPPORTED_MEDIA_TYPE,
-            })
-        }
-    };
-
-    Ok((audio_bytes, file_extension))
-}
-
-/// Get OpenAI configuration (API key and host)
-fn get_openai_config() -> Result<(String, String), ErrorResponse> {
-    let config = goose::config::Config::global();
-
-    let api_key: String = config.get_secret("OPENAI_API_KEY").map_err(|e| match e {
-        goose::config::ConfigError::NotFound(_) => ErrorResponse {
-            message: "OpenAI API key not configured. Please set OPENAI_API_KEY in settings."
-                .to_string(),
-            status: StatusCode::PRECONDITION_FAILED,
-        },
-        _ => ErrorResponse::internal(format!("Failed to get OpenAI API key: {:?}", e)),
-    })?;
-
-    let openai_host = match config.get("OPENAI_HOST", false) {
-        Ok(value) => value
-            .as_str()
-            .map(|s| s.to_string())
-            .unwrap_or_else(|| "https://api.openai.com".to_string()),
-        Err(_) => "https://api.openai.com".to_string(),
-    };
-
-    Ok((api_key, openai_host))
-}
-
-/// Send transcription request to OpenAI Whisper API
-async fn send_openai_request(
-    audio_bytes: Vec<u8>,
-    file_extension: &str,
-    mime_type: &str,
-    api_key: &str,
-    openai_host: &str,
-) -> Result<WhisperResponse, ErrorResponse> {
-    tracing::info!("Using OpenAI host: {}", openai_host);
-    tracing::info!(
-        "Audio file size: {} bytes, extension: {}, mime_type: {}",
-        audio_bytes.len(),
-        file_extension,
-        mime_type
-    );
-
-    // Create a multipart form with the audio file
-    let part = reqwest::multipart::Part::bytes(audio_bytes)
-        .file_name(format!("audio.{}", file_extension))
-        .mime_str(mime_type)
-        .map_err(|e| {
-            ErrorResponse::internal(format!("Failed to create multipart part: {:?}", e))
-        })?;
-
-    let form = reqwest::multipart::Form::new()
-        .part("file", part)
-        .text("model", "whisper-1")
-        .text("response_format", "json");
-
-    tracing::info!("Created multipart form for OpenAI Whisper API");
-
-    // Make request to OpenAI Whisper API
-    let client = Client::builder()
-        .timeout(Duration::from_secs(OPENAI_TIMEOUT_SECONDS))
-        .build()
-        .map_err(|e| ErrorResponse::internal(format!("Failed to create HTTP client: {}", e)))?;
-
-    tracing::info!(
-        "Sending request to OpenAI: {}/v1/audio/transcriptions",
-        openai_host
-    );
-
-    let response = client
-        .post(format!("{}/v1/audio/transcriptions", openai_host))
-        .header("Authorization", format!("Bearer {}", api_key))
-        .multipart(form)
-        .send()
-        .await
-        .map_err(|e| {
-            if e.is_timeout() {
-                ErrorResponse {
-                    message: format!(
-                        "OpenAI API request timed out after {}s",
-                        OPENAI_TIMEOUT_SECONDS
-                    ),
-                    status: StatusCode::GATEWAY_TIMEOUT,
-                }
-            } else {
-                ErrorResponse {
-                    message: format!("Failed to send request to OpenAI: {}", e),
-                    status: StatusCode::SERVICE_UNAVAILABLE,
-                }
-            }
-        })?;
-
-    tracing::info!(
-        "Received response from OpenAI with status: {}",
-        response.status()
-    );
-
-    if !response.status().is_success() {
-        let status = response.status();
-        let error_text = response.text().await.unwrap_or_default();
-        tracing::error!("OpenAI API error (status: {}): {}", status, error_text);
-
-        // Check for specific error codes
-        if status == 401 {
-            return Err(ErrorResponse {
-                message: "OpenAI API key appears to be invalid or unauthorized".to_string(),
-                status: StatusCode::UNAUTHORIZED,
-            });
-        } else if status == 429 {
-            return Err(ErrorResponse {
-                message: "OpenAI API quota or rate limit exceeded".to_string(),
-                status: StatusCode::TOO_MANY_REQUESTS,
-            });
-        }
-
-        return Err(ErrorResponse {
-            message: format!("OpenAI API error: {}", error_text),
-            status: StatusCode::BAD_GATEWAY,
-        });
-    }
-
-    let whisper_response: WhisperResponse = response
-        .json()
-        .await
-        .map_err(|e| ErrorResponse::internal(format!("Failed to parse OpenAI response: {}", e)))?;
-
-    Ok(whisper_response)
-}
-
-/// Transcribe audio using OpenAI's Whisper API
-///
-/// # Request
-/// - `audio`: Base64 encoded audio data
-/// - `mime_type`: MIME type of the audio (e.g., "audio/webm", "audio/wav")
"audio/webm", "audio/wav") -/// -/// # Response -/// - `text`: Transcribed text from the audio -/// -/// # Errors -/// - 401: Unauthorized (missing or invalid X-Secret-Key header) -/// - 412: Precondition Failed (OpenAI API key not configured) -/// - 400: Bad Request (invalid base64 audio data) -/// - 413: Payload Too Large (audio file exceeds 25MB limit) -/// - 415: Unsupported Media Type (unsupported audio format) -/// - 502: Bad Gateway (OpenAI API error) -/// - 503: Service Unavailable (network error) -async fn transcribe_handler( - Json(request): Json, -) -> Result, ErrorResponse> { - let (audio_bytes, file_extension) = validate_audio_input(&request.audio, &request.mime_type)?; - let (api_key, openai_host) = get_openai_config()?; - - let whisper_response = send_openai_request( - audio_bytes, - file_extension, - &request.mime_type, - &api_key, - &openai_host, - ) - .await?; - - Ok(Json(TranscribeResponse { - text: whisper_response.text, - })) -} - -/// Transcribe audio using ElevenLabs Speech-to-Text API -/// -/// Uses ElevenLabs' speech-to-text endpoint for transcription. -/// Requires an ElevenLabs API key with speech-to-text access. -async fn transcribe_elevenlabs_handler( - Json(request): Json, -) -> Result, ErrorResponse> { - let (audio_bytes, file_extension) = validate_audio_input(&request.audio, &request.mime_type)?; - - // Get the ElevenLabs API key from config (after input validation) - let config = goose::config::Config::global(); - - // First try to get it as a secret - let api_key: String = match config.get_secret::("ELEVENLABS_API_KEY") { - Ok(key) => key, - Err(_) => { - // Try to get it as non-secret (for backward compatibility) - match config.get("ELEVENLABS_API_KEY", false) { - Ok(value) => { - match value.as_str() { - Some(key_str) => { - let key = key_str.to_string(); - // Migrate to secret storage - if let Err(e) = config.set( - "ELEVENLABS_API_KEY", - &serde_json::Value::String(key.clone()), - true, - ) { - tracing::error!("Failed to migrate ElevenLabs API key: {:?}", e); - } - // Delete the non-secret version - if let Err(e) = config.delete("ELEVENLABS_API_KEY") { - tracing::warn!( - "Failed to delete non-secret ElevenLabs API key: {:?}", - e - ); - } - key - } - None => { - return Err(ErrorResponse::bad_request(format!( - "ElevenLabs API key is not a string, found: {:?}", - value - ))); - } - } - } - Err(_) => { - return Err(ErrorResponse::bad_request( - "No ElevenLabs API key found in configuration", - )); - } - } - } - }; - - // Create multipart form for ElevenLabs API - let part = reqwest::multipart::Part::bytes(audio_bytes) - .file_name(format!("audio.{}", file_extension)) - .mime_str(&request.mime_type) - .map_err(|_| ErrorResponse::internal("Failed to create multipart part"))?; - - let form = reqwest::multipart::Form::new() - .part("file", part) // Changed from "audio" to "file" - .text("model_id", "scribe_v1") // Use the correct model_id for speech-to-text - .text("tag_audio_events", "false") - .text("diarize", "false"); - - // Make request to ElevenLabs Speech-to-Text API - let client = Client::builder() - .timeout(Duration::from_secs(OPENAI_TIMEOUT_SECONDS)) - .build() - .map_err(|e| ErrorResponse::internal(format!("Failed to create HTTP client: {}", e)))?; - - let response = client - .post("https://api.elevenlabs.io/v1/speech-to-text") - .header("xi-api-key", &api_key) - .multipart(form) - .send() - .await - .map_err(|e| { - if e.is_timeout() { - ErrorResponse { - message: format!( - "ElevenLabs API request timed out after {}s", - OPENAI_TIMEOUT_SECONDS 
-                    ),
-                    status: StatusCode::GATEWAY_TIMEOUT,
-                }
-            } else {
-                ErrorResponse {
-                    message: format!("Failed to send request to ElevenLabs: {}", e),
-                    status: StatusCode::SERVICE_UNAVAILABLE,
-                }
-            }
-        })?;
-
-    if !response.status().is_success() {
-        let status = response.status();
-        let error_text = response.text().await.unwrap_or_default();
-        tracing::error!("ElevenLabs API error (status: {}): {}", status, error_text);
-
-        // Check for specific error codes
-        if error_text.contains("Unauthorized") || error_text.contains("Invalid API key") {
-            return Err(ErrorResponse {
-                message: "ElevenLabs API key is invalid or unauthorized".to_string(),
-                status: StatusCode::UNAUTHORIZED,
-            });
-        } else if error_text.contains("quota") || error_text.contains("limit") {
-            return Err(ErrorResponse {
-                message: "ElevenLabs API quota or rate limit exceeded".to_string(),
-                status: StatusCode::PAYMENT_REQUIRED,
-            });
-        }
-
-        return Err(ErrorResponse {
-            message: format!("ElevenLabs API error: {}", error_text),
-            status: StatusCode::BAD_GATEWAY,
-        });
-    }
-
-    // Parse ElevenLabs response
-    #[derive(Debug, Deserialize)]
-    struct ElevenLabsResponse {
-        text: String,
-        #[serde(rename = "chunks")]
-        #[allow(dead_code)]
-        _chunks: Option<Vec<serde_json::Value>>,
-    }
-
-    let elevenlabs_response: ElevenLabsResponse = response.json().await.map_err(|e| {
-        ErrorResponse::internal(format!("Failed to parse ElevenLabs response: {}", e))
-    })?;
-
-    Ok(Json(TranscribeResponse {
-        text: elevenlabs_response.text,
-    }))
-}
-
-/// Check if dictation providers are configured
-///
-/// Returns configuration status for dictation providers
-async fn check_dictation_config() -> Result<Json<serde_json::Value>, ErrorResponse> {
-    let config = goose::config::Config::global();
-
-    // Check if ElevenLabs API key is configured
-    let has_elevenlabs = match config.get_secret::<String>("ELEVENLABS_API_KEY") {
-        Ok(_) => true,
-        Err(_) => {
-            // Check non-secret for backward compatibility
-            config.get("ELEVENLABS_API_KEY", false).is_ok()
-        }
-    };
-
-    Ok(Json(serde_json::json!({
-        "elevenlabs": has_elevenlabs
-    })))
-}
-
-pub fn routes(state: Arc<AppState>) -> Router {
-    Router::new()
-        .route("/audio/transcribe", post(transcribe_handler))
-        .route(
-            "/audio/transcribe/elevenlabs",
-            post(transcribe_elevenlabs_handler),
-        )
-        .route("/audio/config", get(check_dictation_config))
-        .with_state(state)
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use axum::{body::Body, http::Request};
-    use tower::ServiceExt;
-    use wiremock::matchers::{method, path};
-    use wiremock::{Mock, MockServer, ResponseTemplate};
-
-    #[tokio::test(flavor = "multi_thread")]
-    async fn test_transcribe_endpoint_requires_auth() {
-        let mock_server = MockServer::start().await;
-        Mock::given(method("POST"))
-            .and(path("/v1/audio/transcriptions"))
-            .respond_with(ResponseTemplate::new(401))
-            .mount(&mock_server)
-            .await;
-
-        let _guard = env_lock::lock_env([
-            ("OPENAI_API_KEY", Some("fake-key")),
-            ("OPENAI_HOST", Some(mock_server.uri().as_str())),
-        ]);
-
-        let state = AppState::new().await.unwrap();
-        let app = routes(state);
-        let request = Request::builder()
-            .uri("/audio/transcribe")
-            .method("POST")
-            .header("content-type", "application/json")
-            .body(Body::from(
-                serde_json::to_string(&serde_json::json!({
-                    "audio": "dGVzdA==",
-                    "mime_type": "audio/webm"
-                }))
-                .unwrap(),
-            ))
-            .unwrap();
-
-        let response = app.oneshot(request).await.unwrap();
-        assert_eq!(response.status(), StatusCode::UNAUTHORIZED);
-    }
-
-    #[tokio::test(flavor = "multi_thread")]
-    async fn test_transcribe_endpoint_validates_size() {
-        let state = 
AppState::new().await.unwrap(); - let app = routes(state); - - let request = Request::builder() - .uri("/audio/transcribe") - .method("POST") - .header("content-type", "application/json") - .header("x-secret-key", "test-secret") - .body(Body::from( - serde_json::to_string(&serde_json::json!({ - "audio": "dGVzdA==", - "mime_type": "application/pdf" // Invalid MIME type - })) - .unwrap(), - )) - .unwrap(); - - let response = app.oneshot(request).await.unwrap(); - assert!( - response.status() == StatusCode::UNSUPPORTED_MEDIA_TYPE - || response.status() == StatusCode::PRECONDITION_FAILED - ); - } - - #[tokio::test(flavor = "multi_thread")] - async fn test_transcribe_endpoint_validates_mime_type() { - let state = AppState::new().await.unwrap(); - let app = routes(state); - - let request = Request::builder() - .uri("/audio/transcribe") - .method("POST") - .header("content-type", "application/json") - .header("x-secret-key", "test-secret") - .body(Body::from( - serde_json::to_string(&serde_json::json!({ - "audio": "invalid-base64-!@#$%", - "mime_type": "audio/webm" - })) - .unwrap(), - )) - .unwrap(); - - let response = app.oneshot(request).await.unwrap(); - assert!( - response.status() == StatusCode::BAD_REQUEST - || response.status() == StatusCode::PRECONDITION_FAILED - ); - } -} diff --git a/crates/goose-server/src/routes/dictation.rs b/crates/goose-server/src/routes/dictation.rs new file mode 100644 index 000000000000..5112fd0d044f --- /dev/null +++ b/crates/goose-server/src/routes/dictation.rs @@ -0,0 +1,389 @@ +use crate::routes::errors::ErrorResponse; +use crate::state::AppState; +use axum::{ + http::StatusCode, + routing::{get, post}, + Json, Router, +}; +use base64::{engine::general_purpose::STANDARD as BASE64, Engine}; +use goose::providers::api_client::{ApiClient, AuthMethod}; +use serde::{Deserialize, Serialize}; +use std::collections::HashMap; +use std::sync::Arc; +use std::time::Duration; +use utoipa::ToSchema; + +const MAX_AUDIO_SIZE_BYTES: usize = 25 * 1024 * 1024; +const REQUEST_TIMEOUT: Duration = Duration::from_secs(30); + +// DictationProvider definitions +struct DictationProviderDef { + config_key: &'static str, + default_url: &'static str, + host_key: Option<&'static str>, + description: &'static str, + uses_provider_config: bool, + settings_path: Option<&'static str>, +} + +const PROVIDERS: &[(&str, DictationProviderDef)] = &[ + ( + "openai", + DictationProviderDef { + config_key: "OPENAI_API_KEY", + default_url: "https://api.openai.com/v1/audio/transcriptions", + host_key: Some("OPENAI_HOST"), + description: "Uses OpenAI Whisper API for high-quality transcription.", + uses_provider_config: true, + settings_path: Some("Settings > Models"), + }, + ), + ( + "elevenlabs", + DictationProviderDef { + config_key: "ELEVENLABS_API_KEY", + default_url: "https://api.elevenlabs.io/v1/speech-to-text", + host_key: None, + description: "Uses ElevenLabs speech-to-text API for advanced voice processing.", + uses_provider_config: false, + settings_path: None, + }, + ), +]; + +fn get_provider_def(name: &str) -> Option<&'static DictationProviderDef> { + PROVIDERS + .iter() + .find_map(|(n, def)| if *n == name { Some(def) } else { None }) +} + +#[derive(Debug, Deserialize, ToSchema)] +#[serde(rename_all = "lowercase")] +pub enum DictationProvider { + OpenAI, + ElevenLabs, +} + +impl DictationProvider { + fn as_str(&self) -> &'static str { + match self { + DictationProvider::OpenAI => "openai", + DictationProvider::ElevenLabs => "elevenlabs", + } + } +} + +#[derive(Debug, Deserialize, 
ToSchema)]
+pub struct TranscribeRequest {
+    /// Base64 encoded audio data
+    pub audio: String,
+    /// MIME type of the audio (e.g., "audio/webm", "audio/wav")
+    pub mime_type: String,
+    /// Transcription provider to use
+    pub provider: DictationProvider,
+}
+
+#[derive(Debug, Serialize, Deserialize, ToSchema)]
+pub struct TranscribeResponse {
+    /// Transcribed text from the audio
+    pub text: String,
+}
+
+#[derive(Debug, Serialize, ToSchema)]
+pub struct DictationProviderStatus {
+    /// Whether the provider is fully configured and ready to use
+    pub configured: bool,
+    /// Custom host URL if configured (only for providers that support it)
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub host: Option<String>,
+    /// Description of what this provider does
+    pub description: String,
+    /// Whether this provider uses the main provider config (true) or has its own key (false)
+    pub uses_provider_config: bool,
+    /// Path to settings if uses_provider_config is true
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub settings_path: Option<String>,
+    /// Config key name if uses_provider_config is false
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub config_key: Option<String>,
+}
+
+fn validate_audio(audio: &str, mime_type: &str) -> Result<(Vec<u8>, &'static str), ErrorResponse> {
+    let audio_bytes = BASE64
+        .decode(audio)
+        .map_err(|_| ErrorResponse::bad_request("Invalid base64 audio data"))?;
+
+    if audio_bytes.len() > MAX_AUDIO_SIZE_BYTES {
+        return Err(ErrorResponse {
+            message: format!(
+                "Audio file too large: {} bytes (max: {} bytes)",
+                audio_bytes.len(),
+                MAX_AUDIO_SIZE_BYTES
+            ),
+            status: StatusCode::PAYLOAD_TOO_LARGE,
+        });
+    }
+
+    let extension = match mime_type {
+        "audio/webm" | "audio/webm;codecs=opus" => "webm",
+        "audio/mp4" => "mp4",
+        "audio/mpeg" | "audio/mpga" => "mp3",
+        "audio/m4a" => "m4a",
+        "audio/wav" | "audio/x-wav" => "wav",
+        _ => {
+            return Err(ErrorResponse {
+                message: format!("Unsupported audio format: {}", mime_type),
+                status: StatusCode::UNSUPPORTED_MEDIA_TYPE,
+            })
+        }
+    };
+
+    Ok((audio_bytes, extension))
+}
+
+async fn handle_response_error(response: reqwest::Response) -> ErrorResponse {
+    let status = response.status();
+    let error_text = response.text().await.unwrap_or_default();
+
+    ErrorResponse {
+        message: if status == 401
+            || error_text.contains("Invalid API key")
+            || error_text.contains("Unauthorized")
+        {
+            "Invalid API key".to_string()
+        } else if status == 429 || error_text.contains("quota") || error_text.contains("limit") {
+            "Rate limit exceeded".to_string()
+        } else {
+            format!("API error: {}", error_text)
+        },
+        status: if status.is_client_error() {
+            status
+        } else {
+            StatusCode::BAD_GATEWAY
+        },
+    }
+}
+
+fn build_api_client(provider: &str) -> Result<ApiClient, ErrorResponse> {
+    let config = goose::config::Config::global();
+    let def = get_provider_def(provider)
+        .ok_or_else(|| ErrorResponse::bad_request(format!("Unknown provider: {}", provider)))?;
+
+    let api_key = config
+        .get_secret(def.config_key)
+        .map_err(|_| ErrorResponse {
+            message: format!("{} not configured", def.config_key),
+            status: StatusCode::PRECONDITION_FAILED,
+        })?;
+
+    let url = if let Some(host_key) = def.host_key {
+        config
+            .get(host_key, false)
+            .ok()
+            .and_then(|v| v.as_str().map(|s| s.to_string()))
+            .map(|custom_host| {
+                // Carve the path off the default URL ("https://host/v1/..." ->
+                // "/v1/...") and graft it onto the custom host override.
+                let path = def
+                    .default_url
+                    .splitn(4, '/')
+                    .nth(3)
+                    .map(|p| format!("/{}", p))
+                    .unwrap_or_default();
+                format!("{}{}", custom_host.trim_end_matches('/'), path)
+            })
+            .unwrap_or_else(|| def.default_url.to_string())
+    } else {
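+        // Providers without a host_key (here, ElevenLabs) always use their fixed default URL.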
+        def.default_url.to_string()
+    };
+
+    let auth = match provider {
+        "openai" => AuthMethod::BearerToken(api_key),
+        "elevenlabs" => AuthMethod::ApiKey {
+            header_name: "xi-api-key".to_string(),
+            key: api_key,
+        },
+        _ => {
+            return Err(ErrorResponse::bad_request(format!(
+                "Unknown provider: {}",
+                provider
+            )))
+        }
+    };
+
+    ApiClient::with_timeout(url, auth, REQUEST_TIMEOUT)
+        .map_err(|e| ErrorResponse::internal(format!("Failed to create client: {}", e)))
+}
+
+async fn transcribe_openai(
+    audio_bytes: Vec<u8>,
+    extension: &str,
+    mime_type: &str,
+    client: &ApiClient,
+) -> Result<String, ErrorResponse> {
+    let part = reqwest::multipart::Part::bytes(audio_bytes)
+        .file_name(format!("audio.{}", extension))
+        .mime_str(mime_type)
+        .map_err(|e| ErrorResponse::internal(format!("Failed to create multipart: {}", e)))?;
+
+    let form = reqwest::multipart::Form::new()
+        .part("file", part)
+        .text("model", "whisper-1");
+
+    let response = client
+        .request(None, "")
+        .multipart_post(form)
+        .await
+        .map_err(|e| ErrorResponse {
+            message: if e.to_string().contains("timeout") {
+                "Request timed out".to_string()
+            } else {
+                format!("Request failed: {}", e)
+            },
+            status: if e.to_string().contains("timeout") {
+                StatusCode::GATEWAY_TIMEOUT
+            } else {
+                StatusCode::SERVICE_UNAVAILABLE
+            },
+        })?;
+
+    if !response.status().is_success() {
+        return Err(handle_response_error(response).await);
+    }
+
+    let data: TranscribeResponse = response
+        .json()
+        .await
+        .map_err(|e| ErrorResponse::internal(format!("Failed to parse response: {}", e)))?;
+
+    Ok(data.text)
+}
+
+async fn transcribe_elevenlabs(
+    audio_bytes: Vec<u8>,
+    extension: &str,
+    mime_type: &str,
+    client: &ApiClient,
+) -> Result<String, ErrorResponse> {
+    let part = reqwest::multipart::Part::bytes(audio_bytes)
+        .file_name(format!("audio.{}", extension))
+        .mime_str(mime_type)
+        .map_err(|_| ErrorResponse::internal("Failed to create multipart"))?;
+
+    let form = reqwest::multipart::Form::new()
+        .part("file", part)
+        .text("model_id", "scribe_v1");
+
+    let response = client
+        .request(None, "")
+        .multipart_post(form)
+        .await
+        .map_err(|e| ErrorResponse {
+            message: if e.to_string().contains("timeout") {
+                "Request timed out".to_string()
+            } else {
+                format!("Request failed: {}", e)
+            },
+            status: if e.to_string().contains("timeout") {
+                StatusCode::GATEWAY_TIMEOUT
+            } else {
+                StatusCode::SERVICE_UNAVAILABLE
+            },
+        })?;
+
+    if !response.status().is_success() {
+        return Err(handle_response_error(response).await);
+    }
+
+    let data: TranscribeResponse = response
+        .json()
+        .await
+        .map_err(|e| ErrorResponse::internal(format!("Failed to parse response: {}", e)))?;
+
+    Ok(data.text)
+}
+
+#[utoipa::path(
+    post,
+    path = "/dictation/transcribe",
+    request_body = TranscribeRequest,
+    responses(
+        (status = 200, description = "Audio transcribed successfully", body = TranscribeResponse),
+        (status = 400, description = "Invalid request (bad base64 or unsupported format)"),
+        (status = 401, description = "Invalid API key"),
+        (status = 412, description = "DictationProvider not configured"),
+        (status = 413, description = "Audio file too large (max 25MB)"),
+        (status = 429, description = "Rate limit exceeded"),
+        (status = 500, description = "Internal server error"),
+        (status = 502, description = "DictationProvider API error"),
+        (status = 503, description = "Service unavailable"),
+        (status = 504, description = "Request timeout")
+    )
+)]
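+/// Example request body (illustrative values only):
+/// `{ "audio": "<base64-encoded bytes>", "mime_type": "audio/webm", "provider": "openai" }`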
&request.mime_type)?; + let provider_name = request.provider.as_str(); + let client = build_api_client(provider_name)?; + + let text = match request.provider { + DictationProvider::OpenAI => { + transcribe_openai(audio_bytes, extension, &request.mime_type, &client).await? + } + DictationProvider::ElevenLabs => { + transcribe_elevenlabs(audio_bytes, extension, &request.mime_type, &client).await? + } + }; + + Ok(Json(TranscribeResponse { text })) +} + +#[utoipa::path( + get, + path = "/dictation/config", + responses( + (status = 200, description = "Audio transcription provider configurations", body = HashMap) + ) +)] +pub async fn get_dictation_config( +) -> Result>, ErrorResponse> { + let config = goose::config::Config::global(); + let mut providers = HashMap::new(); + + for (name, def) in PROVIDERS.iter() { + let configured = config.get_secret::(def.config_key).is_ok(); + + let host = if let Some(host_key) = def.host_key { + config + .get(host_key, false) + .ok() + .and_then(|v| v.as_str().map(|s| s.to_string())) + } else { + None + }; + + providers.insert( + name.to_string(), + DictationProviderStatus { + configured, + host, + description: def.description.to_string(), + uses_provider_config: def.uses_provider_config, + settings_path: def.settings_path.map(|s| s.to_string()), + config_key: if !def.uses_provider_config { + Some(def.config_key.to_string()) + } else { + None + }, + }, + ); + } + + Ok(Json(providers)) +} + +pub fn routes(state: Arc) -> Router { + Router::new() + .route("/dictation/transcribe", post(transcribe_dictation)) + .route("/dictation/config", get(get_dictation_config)) + .with_state(state) +} diff --git a/crates/goose-server/src/routes/mod.rs b/crates/goose-server/src/routes/mod.rs index d65241cc0d75..e0935c2476a8 100644 --- a/crates/goose-server/src/routes/mod.rs +++ b/crates/goose-server/src/routes/mod.rs @@ -1,7 +1,7 @@ pub mod action_required; pub mod agent; -pub mod audio; pub mod config_management; +pub mod dictation; pub mod errors; pub mod mcp_app_proxy; pub mod mcp_ui_proxy; @@ -28,7 +28,7 @@ pub fn configure(state: Arc, secret_key: String) -> Rout .merge(reply::routes(state.clone())) .merge(action_required::routes(state.clone())) .merge(agent::routes(state.clone())) - .merge(audio::routes(state.clone())) + .merge(dictation::routes(state.clone())) .merge(config_management::routes(state.clone())) .merge(prompts::routes()) .merge(recipe::routes(state.clone())) diff --git a/crates/goose/src/providers/api_client.rs b/crates/goose/src/providers/api_client.rs index 627f5435d535..4524f1a9ee36 100644 --- a/crates/goose/src/providers/api_client.rs +++ b/crates/goose/src/providers/api_client.rs @@ -365,6 +365,11 @@ impl<'a> ApiRequestBuilder<'a> { Ok(request.json(payload).send().await?) } + pub async fn multipart_post(self, form: reqwest::multipart::Form) -> Result { + let request = self.send_request(|url, client| client.post(url)).await?; + Ok(request.multipart(form).send().await?) 
+ } + pub async fn api_get(self) -> Result { let response = self.response_get().await?; ApiResponse::from_response(response).await diff --git a/ui/desktop/openapi.json b/ui/desktop/openapi.json index 1b56e711b402..806cb04ba92c 100644 --- a/ui/desktop/openapi.json +++ b/ui/desktop/openapi.json @@ -1571,6 +1571,86 @@ } } }, + "/dictation/config": { + "get": { + "tags": [ + "super::routes::dictation" + ], + "operationId": "get_dictation_config", + "responses": { + "200": { + "description": "Audio transcription provider configurations", + "content": { + "application/json": { + "schema": { + "type": "object", + "additionalProperties": { + "$ref": "#/components/schemas/DictationProviderStatus" + } + } + } + } + } + } + } + }, + "/dictation/transcribe": { + "post": { + "tags": [ + "super::routes::dictation" + ], + "operationId": "transcribe_dictation", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/TranscribeRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Audio transcribed successfully", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/TranscribeResponse" + } + } + } + }, + "400": { + "description": "Invalid request (bad base64 or unsupported format)" + }, + "401": { + "description": "Invalid API key" + }, + "412": { + "description": "DictationProvider not configured" + }, + "413": { + "description": "Audio file too large (max 25MB)" + }, + "429": { + "description": "Rate limit exceeded" + }, + "500": { + "description": "Internal server error" + }, + "502": { + "description": "DictationProvider API error" + }, + "503": { + "description": "Service unavailable" + }, + "504": { + "description": "Request timeout" + } + } + } + }, "/handle_openrouter": { "post": { "tags": [ @@ -3593,6 +3673,50 @@ } } }, + "DictationProvider": { + "type": "string", + "enum": [ + "openai", + "elevenlabs" + ] + }, + "DictationProviderStatus": { + "type": "object", + "required": [ + "configured", + "description", + "uses_provider_config" + ], + "properties": { + "config_key": { + "type": "string", + "description": "Config key name if uses_provider_config is false", + "nullable": true + }, + "configured": { + "type": "boolean", + "description": "Whether the provider is fully configured and ready to use" + }, + "description": { + "type": "string", + "description": "Description of what this provider does" + }, + "host": { + "type": "string", + "description": "Custom host URL if configured (only for providers that support it)", + "nullable": true + }, + "settings_path": { + "type": "string", + "description": "Path to settings if uses_provider_config is true", + "nullable": true + }, + "uses_provider_config": { + "type": "boolean", + "description": "Whether this provider uses the main provider config (true) or has its own key (false)" + } + } + }, "EmbeddedResource": { "type": "object", "required": [ @@ -6612,6 +6736,39 @@ } } }, + "TranscribeRequest": { + "type": "object", + "required": [ + "audio", + "mime_type", + "provider" + ], + "properties": { + "audio": { + "type": "string", + "description": "Base64 encoded audio data" + }, + "mime_type": { + "type": "string", + "description": "MIME type of the audio (e.g., \"audio/webm\", \"audio/wav\")" + }, + "provider": { + "$ref": "#/components/schemas/DictationProvider" + } + } + }, + "TranscribeResponse": { + "type": "object", + "required": [ + "text" + ], + "properties": { + "text": { + "type": "string", + "description": "Transcribed text from 
the audio" + } + } + }, "TunnelInfo": { "type": "object", "required": [ diff --git a/ui/desktop/src/api/index.ts b/ui/desktop/src/api/index.ts index 4ff4623ae0a1..0b6cd8f14735 100644 --- a/ui/desktop/src/api/index.ts +++ b/ui/desktop/src/api/index.ts @@ -1,4 +1,4 @@ // This file is auto-generated by @hey-api/openapi-ts -export { addExtension, agentAddExtension, agentRemoveExtension, backupConfig, callTool, checkProvider, configureProviderOauth, confirmToolAction, createCustomProvider, createRecipe, createSchedule, decodeRecipe, deleteRecipe, deleteSchedule, deleteSession, detectProvider, diagnostics, encodeRecipe, exportApp, exportSession, forkSession, getCustomProvider, getExtensions, getPricing, getPrompt, getPrompts, getProviderModels, getSession, getSessionExtensions, getSessionInsights, getSlashCommands, getTools, getTunnelStatus, importApp, importSession, initConfig, inspectRunningJob, killRunningJob, listApps, listRecipes, listSchedules, listSessions, mcpUiProxy, type Options, parseRecipe, pauseSchedule, providers, readAllConfig, readConfig, readResource, recipeToYaml, recoverConfig, removeConfig, removeCustomProvider, removeExtension, reply, resetPrompt, restartAgent, resumeAgent, runNowHandler, savePrompt, saveRecipe, scanRecipe, scheduleRecipe, sendTelemetryEvent, sessionsHandler, setConfigProvider, setRecipeSlashCommand, startAgent, startOpenrouterSetup, startTetrateSetup, startTunnel, status, stopAgent, stopTunnel, systemInfo, unpauseSchedule, updateAgentProvider, updateCustomProvider, updateFromSession, updateSchedule, updateSessionName, updateSessionUserRecipeValues, updateWorkingDir, upsertConfig, upsertPermissions, validateConfig } from './sdk.gen'; -export type { ActionRequired, ActionRequiredData, AddExtensionData, AddExtensionErrors, AddExtensionRequest, AddExtensionResponse, AddExtensionResponses, AgentAddExtensionData, AgentAddExtensionErrors, AgentAddExtensionResponse, AgentAddExtensionResponses, AgentRemoveExtensionData, AgentRemoveExtensionErrors, AgentRemoveExtensionResponse, AgentRemoveExtensionResponses, Annotations, Author, AuthorRequest, BackupConfigData, BackupConfigErrors, BackupConfigResponse, BackupConfigResponses, CallToolData, CallToolErrors, CallToolRequest, CallToolResponse, CallToolResponse2, CallToolResponses, ChatRequest, CheckProviderData, CheckProviderRequest, ClientOptions, CommandType, ConfigKey, ConfigKeyQuery, ConfigResponse, ConfigureProviderOauthData, ConfigureProviderOauthErrors, ConfigureProviderOauthResponses, ConfirmToolActionData, ConfirmToolActionErrors, ConfirmToolActionRequest, ConfirmToolActionResponses, Content, Conversation, CreateCustomProviderData, CreateCustomProviderErrors, CreateCustomProviderResponse, CreateCustomProviderResponses, CreateRecipeData, CreateRecipeErrors, CreateRecipeRequest, CreateRecipeResponse, CreateRecipeResponse2, CreateRecipeResponses, CreateScheduleData, CreateScheduleErrors, CreateScheduleRequest, CreateScheduleResponse, CreateScheduleResponses, CspMetadata, DeclarativeProviderConfig, DecodeRecipeData, DecodeRecipeErrors, DecodeRecipeRequest, DecodeRecipeResponse, DecodeRecipeResponse2, DecodeRecipeResponses, DeleteRecipeData, DeleteRecipeErrors, DeleteRecipeRequest, DeleteRecipeResponse, DeleteRecipeResponses, DeleteScheduleData, DeleteScheduleErrors, DeleteScheduleResponse, DeleteScheduleResponses, DeleteSessionData, DeleteSessionErrors, DeleteSessionResponses, DetectProviderData, DetectProviderErrors, DetectProviderRequest, DetectProviderResponse, DetectProviderResponse2, DetectProviderResponses, 
DiagnosticsData, DiagnosticsErrors, DiagnosticsResponse, DiagnosticsResponses, EmbeddedResource, EncodeRecipeData, EncodeRecipeErrors, EncodeRecipeRequest, EncodeRecipeResponse, EncodeRecipeResponse2, EncodeRecipeResponses, Envs, ErrorResponse, ExportAppData, ExportAppError, ExportAppErrors, ExportAppResponse, ExportAppResponses, ExportSessionData, ExportSessionErrors, ExportSessionResponse, ExportSessionResponses, ExtensionConfig, ExtensionData, ExtensionEntry, ExtensionLoadResult, ExtensionQuery, ExtensionResponse, ForkRequest, ForkResponse, ForkSessionData, ForkSessionErrors, ForkSessionResponse, ForkSessionResponses, FrontendToolRequest, GetCustomProviderData, GetCustomProviderErrors, GetCustomProviderResponse, GetCustomProviderResponses, GetExtensionsData, GetExtensionsErrors, GetExtensionsResponse, GetExtensionsResponses, GetPricingData, GetPricingResponse, GetPricingResponses, GetPromptData, GetPromptErrors, GetPromptResponse, GetPromptResponses, GetPromptsData, GetPromptsResponse, GetPromptsResponses, GetProviderModelsData, GetProviderModelsErrors, GetProviderModelsResponse, GetProviderModelsResponses, GetSessionData, GetSessionErrors, GetSessionExtensionsData, GetSessionExtensionsErrors, GetSessionExtensionsResponse, GetSessionExtensionsResponses, GetSessionInsightsData, GetSessionInsightsErrors, GetSessionInsightsResponse, GetSessionInsightsResponses, GetSessionResponse, GetSessionResponses, GetSlashCommandsData, GetSlashCommandsResponse, GetSlashCommandsResponses, GetToolsData, GetToolsErrors, GetToolsQuery, GetToolsResponse, GetToolsResponses, GetTunnelStatusData, GetTunnelStatusResponse, GetTunnelStatusResponses, GooseApp, Icon, ImageContent, ImportAppData, ImportAppError, ImportAppErrors, ImportAppRequest, ImportAppResponse, ImportAppResponse2, ImportAppResponses, ImportSessionData, ImportSessionErrors, ImportSessionRequest, ImportSessionResponse, ImportSessionResponses, InitConfigData, InitConfigErrors, InitConfigResponse, InitConfigResponses, InspectJobResponse, InspectRunningJobData, InspectRunningJobErrors, InspectRunningJobResponse, InspectRunningJobResponses, JsonObject, KillJobResponse, KillRunningJobData, KillRunningJobResponses, ListAppsData, ListAppsError, ListAppsErrors, ListAppsRequest, ListAppsResponse, ListAppsResponse2, ListAppsResponses, ListRecipeResponse, ListRecipesData, ListRecipesErrors, ListRecipesResponse, ListRecipesResponses, ListSchedulesData, ListSchedulesErrors, ListSchedulesResponse, ListSchedulesResponse2, ListSchedulesResponses, ListSessionsData, ListSessionsErrors, ListSessionsResponse, ListSessionsResponses, LoadedProvider, McpAppResource, McpUiProxyData, McpUiProxyErrors, McpUiProxyResponses, Message, MessageContent, MessageEvent, MessageMetadata, ModelConfig, ModelInfo, ParseRecipeData, ParseRecipeError, ParseRecipeErrors, ParseRecipeRequest, ParseRecipeResponse, ParseRecipeResponse2, ParseRecipeResponses, PauseScheduleData, PauseScheduleErrors, PauseScheduleResponse, PauseScheduleResponses, PermissionLevel, PricingData, PricingQuery, PricingResponse, PrincipalType, PromptContentResponse, PromptsListResponse, ProviderDetails, ProviderEngine, ProviderMetadata, ProvidersData, ProvidersResponse, ProvidersResponse2, ProvidersResponses, ProviderType, RawAudioContent, RawEmbeddedResource, RawImageContent, RawResource, RawTextContent, ReadAllConfigData, ReadAllConfigResponse, ReadAllConfigResponses, ReadConfigData, ReadConfigErrors, ReadConfigResponses, ReadResourceData, ReadResourceErrors, ReadResourceRequest, ReadResourceResponse, 
ReadResourceResponse2, ReadResourceResponses, Recipe, RecipeManifest, RecipeParameter, RecipeParameterInputType, RecipeParameterRequirement, RecipeToYamlData, RecipeToYamlError, RecipeToYamlErrors, RecipeToYamlRequest, RecipeToYamlResponse, RecipeToYamlResponse2, RecipeToYamlResponses, RecoverConfigData, RecoverConfigErrors, RecoverConfigResponse, RecoverConfigResponses, RedactedThinkingContent, RemoveConfigData, RemoveConfigErrors, RemoveConfigResponse, RemoveConfigResponses, RemoveCustomProviderData, RemoveCustomProviderErrors, RemoveCustomProviderResponse, RemoveCustomProviderResponses, RemoveExtensionData, RemoveExtensionErrors, RemoveExtensionRequest, RemoveExtensionResponse, RemoveExtensionResponses, ReplyData, ReplyErrors, ReplyResponse, ReplyResponses, ResetPromptData, ResetPromptErrors, ResetPromptResponse, ResetPromptResponses, ResourceContents, ResourceMetadata, Response, RestartAgentData, RestartAgentErrors, RestartAgentRequest, RestartAgentResponse, RestartAgentResponse2, RestartAgentResponses, ResumeAgentData, ResumeAgentErrors, ResumeAgentRequest, ResumeAgentResponse, ResumeAgentResponse2, ResumeAgentResponses, RetryConfig, Role, RunNowHandlerData, RunNowHandlerErrors, RunNowHandlerResponse, RunNowHandlerResponses, RunNowResponse, SavePromptData, SavePromptErrors, SavePromptRequest, SavePromptResponse, SavePromptResponses, SaveRecipeData, SaveRecipeError, SaveRecipeErrors, SaveRecipeRequest, SaveRecipeResponse, SaveRecipeResponse2, SaveRecipeResponses, ScanRecipeData, ScanRecipeRequest, ScanRecipeResponse, ScanRecipeResponse2, ScanRecipeResponses, ScheduledJob, ScheduleRecipeData, ScheduleRecipeErrors, ScheduleRecipeRequest, ScheduleRecipeResponses, SendTelemetryEventData, SendTelemetryEventResponses, Session, SessionDisplayInfo, SessionExtensionsResponse, SessionInsights, SessionListResponse, SessionsHandlerData, SessionsHandlerErrors, SessionsHandlerResponse, SessionsHandlerResponses, SessionsQuery, SessionType, SetConfigProviderData, SetProviderRequest, SetRecipeSlashCommandData, SetRecipeSlashCommandErrors, SetRecipeSlashCommandResponses, SetSlashCommandRequest, Settings, SetupResponse, SlashCommand, SlashCommandsResponse, StartAgentData, StartAgentError, StartAgentErrors, StartAgentRequest, StartAgentResponse, StartAgentResponses, StartOpenrouterSetupData, StartOpenrouterSetupResponse, StartOpenrouterSetupResponses, StartTetrateSetupData, StartTetrateSetupResponse, StartTetrateSetupResponses, StartTunnelData, StartTunnelError, StartTunnelErrors, StartTunnelResponse, StartTunnelResponses, StatusData, StatusResponse, StatusResponses, StopAgentData, StopAgentErrors, StopAgentRequest, StopAgentResponse, StopAgentResponses, StopTunnelData, StopTunnelError, StopTunnelErrors, StopTunnelResponses, SubRecipe, SuccessCheck, SystemInfo, SystemInfoData, SystemInfoResponse, SystemInfoResponses, SystemNotificationContent, SystemNotificationType, TelemetryEventRequest, Template, TextContent, ThinkingContent, TokenState, Tool, ToolAnnotations, ToolConfirmationRequest, ToolInfo, ToolPermission, ToolRequest, ToolResponse, TunnelInfo, TunnelState, UiMetadata, UnpauseScheduleData, UnpauseScheduleErrors, UnpauseScheduleResponse, UnpauseScheduleResponses, UpdateAgentProviderData, UpdateAgentProviderErrors, UpdateAgentProviderResponses, UpdateCustomProviderData, UpdateCustomProviderErrors, UpdateCustomProviderRequest, UpdateCustomProviderResponse, UpdateCustomProviderResponses, UpdateFromSessionData, UpdateFromSessionErrors, UpdateFromSessionRequest, UpdateFromSessionResponses, 
UpdateProviderRequest, UpdateScheduleData, UpdateScheduleErrors, UpdateScheduleRequest, UpdateScheduleResponse, UpdateScheduleResponses, UpdateSessionNameData, UpdateSessionNameErrors, UpdateSessionNameRequest, UpdateSessionNameResponses, UpdateSessionUserRecipeValuesData, UpdateSessionUserRecipeValuesError, UpdateSessionUserRecipeValuesErrors, UpdateSessionUserRecipeValuesRequest, UpdateSessionUserRecipeValuesResponse, UpdateSessionUserRecipeValuesResponse2, UpdateSessionUserRecipeValuesResponses, UpdateWorkingDirData, UpdateWorkingDirErrors, UpdateWorkingDirRequest, UpdateWorkingDirResponses, UpsertConfigData, UpsertConfigErrors, UpsertConfigQuery, UpsertConfigResponse, UpsertConfigResponses, UpsertPermissionsData, UpsertPermissionsErrors, UpsertPermissionsQuery, UpsertPermissionsResponse, UpsertPermissionsResponses, ValidateConfigData, ValidateConfigErrors, ValidateConfigResponse, ValidateConfigResponses, WindowProps } from './types.gen'; +export { addExtension, agentAddExtension, agentRemoveExtension, backupConfig, callTool, checkProvider, configureProviderOauth, confirmToolAction, createCustomProvider, createRecipe, createSchedule, decodeRecipe, deleteRecipe, deleteSchedule, deleteSession, detectProvider, diagnostics, encodeRecipe, exportApp, exportSession, forkSession, getCustomProvider, getDictationConfig, getExtensions, getPricing, getPrompt, getPrompts, getProviderModels, getSession, getSessionExtensions, getSessionInsights, getSlashCommands, getTools, getTunnelStatus, importApp, importSession, initConfig, inspectRunningJob, killRunningJob, listApps, listRecipes, listSchedules, listSessions, mcpUiProxy, type Options, parseRecipe, pauseSchedule, providers, readAllConfig, readConfig, readResource, recipeToYaml, recoverConfig, removeConfig, removeCustomProvider, removeExtension, reply, resetPrompt, restartAgent, resumeAgent, runNowHandler, savePrompt, saveRecipe, scanRecipe, scheduleRecipe, sendTelemetryEvent, sessionsHandler, setConfigProvider, setRecipeSlashCommand, startAgent, startOpenrouterSetup, startTetrateSetup, startTunnel, status, stopAgent, stopTunnel, systemInfo, transcribeDictation, unpauseSchedule, updateAgentProvider, updateCustomProvider, updateFromSession, updateSchedule, updateSessionName, updateSessionUserRecipeValues, updateWorkingDir, upsertConfig, upsertPermissions, validateConfig } from './sdk.gen'; +export type { ActionRequired, ActionRequiredData, AddExtensionData, AddExtensionErrors, AddExtensionRequest, AddExtensionResponse, AddExtensionResponses, AgentAddExtensionData, AgentAddExtensionErrors, AgentAddExtensionResponse, AgentAddExtensionResponses, AgentRemoveExtensionData, AgentRemoveExtensionErrors, AgentRemoveExtensionResponse, AgentRemoveExtensionResponses, Annotations, Author, AuthorRequest, BackupConfigData, BackupConfigErrors, BackupConfigResponse, BackupConfigResponses, CallToolData, CallToolErrors, CallToolRequest, CallToolResponse, CallToolResponse2, CallToolResponses, ChatRequest, CheckProviderData, CheckProviderRequest, ClientOptions, CommandType, ConfigKey, ConfigKeyQuery, ConfigResponse, ConfigureProviderOauthData, ConfigureProviderOauthErrors, ConfigureProviderOauthResponses, ConfirmToolActionData, ConfirmToolActionErrors, ConfirmToolActionRequest, ConfirmToolActionResponses, Content, Conversation, CreateCustomProviderData, CreateCustomProviderErrors, CreateCustomProviderResponse, CreateCustomProviderResponses, CreateRecipeData, CreateRecipeErrors, CreateRecipeRequest, CreateRecipeResponse, CreateRecipeResponse2, CreateRecipeResponses, 
CreateScheduleData, CreateScheduleErrors, CreateScheduleRequest, CreateScheduleResponse, CreateScheduleResponses, CspMetadata, DeclarativeProviderConfig, DecodeRecipeData, DecodeRecipeErrors, DecodeRecipeRequest, DecodeRecipeResponse, DecodeRecipeResponse2, DecodeRecipeResponses, DeleteRecipeData, DeleteRecipeErrors, DeleteRecipeRequest, DeleteRecipeResponse, DeleteRecipeResponses, DeleteScheduleData, DeleteScheduleErrors, DeleteScheduleResponse, DeleteScheduleResponses, DeleteSessionData, DeleteSessionErrors, DeleteSessionResponses, DetectProviderData, DetectProviderErrors, DetectProviderRequest, DetectProviderResponse, DetectProviderResponse2, DetectProviderResponses, DiagnosticsData, DiagnosticsErrors, DiagnosticsResponse, DiagnosticsResponses, DictationProvider, DictationProviderStatus, EmbeddedResource, EncodeRecipeData, EncodeRecipeErrors, EncodeRecipeRequest, EncodeRecipeResponse, EncodeRecipeResponse2, EncodeRecipeResponses, Envs, ErrorResponse, ExportAppData, ExportAppError, ExportAppErrors, ExportAppResponse, ExportAppResponses, ExportSessionData, ExportSessionErrors, ExportSessionResponse, ExportSessionResponses, ExtensionConfig, ExtensionData, ExtensionEntry, ExtensionLoadResult, ExtensionQuery, ExtensionResponse, ForkRequest, ForkResponse, ForkSessionData, ForkSessionErrors, ForkSessionResponse, ForkSessionResponses, FrontendToolRequest, GetCustomProviderData, GetCustomProviderErrors, GetCustomProviderResponse, GetCustomProviderResponses, GetDictationConfigData, GetDictationConfigResponse, GetDictationConfigResponses, GetExtensionsData, GetExtensionsErrors, GetExtensionsResponse, GetExtensionsResponses, GetPricingData, GetPricingResponse, GetPricingResponses, GetPromptData, GetPromptErrors, GetPromptResponse, GetPromptResponses, GetPromptsData, GetPromptsResponse, GetPromptsResponses, GetProviderModelsData, GetProviderModelsErrors, GetProviderModelsResponse, GetProviderModelsResponses, GetSessionData, GetSessionErrors, GetSessionExtensionsData, GetSessionExtensionsErrors, GetSessionExtensionsResponse, GetSessionExtensionsResponses, GetSessionInsightsData, GetSessionInsightsErrors, GetSessionInsightsResponse, GetSessionInsightsResponses, GetSessionResponse, GetSessionResponses, GetSlashCommandsData, GetSlashCommandsResponse, GetSlashCommandsResponses, GetToolsData, GetToolsErrors, GetToolsQuery, GetToolsResponse, GetToolsResponses, GetTunnelStatusData, GetTunnelStatusResponse, GetTunnelStatusResponses, GooseApp, Icon, ImageContent, ImportAppData, ImportAppError, ImportAppErrors, ImportAppRequest, ImportAppResponse, ImportAppResponse2, ImportAppResponses, ImportSessionData, ImportSessionErrors, ImportSessionRequest, ImportSessionResponse, ImportSessionResponses, InitConfigData, InitConfigErrors, InitConfigResponse, InitConfigResponses, InspectJobResponse, InspectRunningJobData, InspectRunningJobErrors, InspectRunningJobResponse, InspectRunningJobResponses, JsonObject, KillJobResponse, KillRunningJobData, KillRunningJobResponses, ListAppsData, ListAppsError, ListAppsErrors, ListAppsRequest, ListAppsResponse, ListAppsResponse2, ListAppsResponses, ListRecipeResponse, ListRecipesData, ListRecipesErrors, ListRecipesResponse, ListRecipesResponses, ListSchedulesData, ListSchedulesErrors, ListSchedulesResponse, ListSchedulesResponse2, ListSchedulesResponses, ListSessionsData, ListSessionsErrors, ListSessionsResponse, ListSessionsResponses, LoadedProvider, McpAppResource, McpUiProxyData, McpUiProxyErrors, McpUiProxyResponses, Message, MessageContent, MessageEvent, MessageMetadata, 
ModelConfig, ModelInfo, ParseRecipeData, ParseRecipeError, ParseRecipeErrors, ParseRecipeRequest, ParseRecipeResponse, ParseRecipeResponse2, ParseRecipeResponses, PauseScheduleData, PauseScheduleErrors, PauseScheduleResponse, PauseScheduleResponses, PermissionLevel, PricingData, PricingQuery, PricingResponse, PrincipalType, PromptContentResponse, PromptsListResponse, ProviderDetails, ProviderEngine, ProviderMetadata, ProvidersData, ProvidersResponse, ProvidersResponse2, ProvidersResponses, ProviderType, RawAudioContent, RawEmbeddedResource, RawImageContent, RawResource, RawTextContent, ReadAllConfigData, ReadAllConfigResponse, ReadAllConfigResponses, ReadConfigData, ReadConfigErrors, ReadConfigResponses, ReadResourceData, ReadResourceErrors, ReadResourceRequest, ReadResourceResponse, ReadResourceResponse2, ReadResourceResponses, Recipe, RecipeManifest, RecipeParameter, RecipeParameterInputType, RecipeParameterRequirement, RecipeToYamlData, RecipeToYamlError, RecipeToYamlErrors, RecipeToYamlRequest, RecipeToYamlResponse, RecipeToYamlResponse2, RecipeToYamlResponses, RecoverConfigData, RecoverConfigErrors, RecoverConfigResponse, RecoverConfigResponses, RedactedThinkingContent, RemoveConfigData, RemoveConfigErrors, RemoveConfigResponse, RemoveConfigResponses, RemoveCustomProviderData, RemoveCustomProviderErrors, RemoveCustomProviderResponse, RemoveCustomProviderResponses, RemoveExtensionData, RemoveExtensionErrors, RemoveExtensionRequest, RemoveExtensionResponse, RemoveExtensionResponses, ReplyData, ReplyErrors, ReplyResponse, ReplyResponses, ResetPromptData, ResetPromptErrors, ResetPromptResponse, ResetPromptResponses, ResourceContents, ResourceMetadata, Response, RestartAgentData, RestartAgentErrors, RestartAgentRequest, RestartAgentResponse, RestartAgentResponse2, RestartAgentResponses, ResumeAgentData, ResumeAgentErrors, ResumeAgentRequest, ResumeAgentResponse, ResumeAgentResponse2, ResumeAgentResponses, RetryConfig, Role, RunNowHandlerData, RunNowHandlerErrors, RunNowHandlerResponse, RunNowHandlerResponses, RunNowResponse, SavePromptData, SavePromptErrors, SavePromptRequest, SavePromptResponse, SavePromptResponses, SaveRecipeData, SaveRecipeError, SaveRecipeErrors, SaveRecipeRequest, SaveRecipeResponse, SaveRecipeResponse2, SaveRecipeResponses, ScanRecipeData, ScanRecipeRequest, ScanRecipeResponse, ScanRecipeResponse2, ScanRecipeResponses, ScheduledJob, ScheduleRecipeData, ScheduleRecipeErrors, ScheduleRecipeRequest, ScheduleRecipeResponses, SendTelemetryEventData, SendTelemetryEventResponses, Session, SessionDisplayInfo, SessionExtensionsResponse, SessionInsights, SessionListResponse, SessionsHandlerData, SessionsHandlerErrors, SessionsHandlerResponse, SessionsHandlerResponses, SessionsQuery, SessionType, SetConfigProviderData, SetProviderRequest, SetRecipeSlashCommandData, SetRecipeSlashCommandErrors, SetRecipeSlashCommandResponses, SetSlashCommandRequest, Settings, SetupResponse, SlashCommand, SlashCommandsResponse, StartAgentData, StartAgentError, StartAgentErrors, StartAgentRequest, StartAgentResponse, StartAgentResponses, StartOpenrouterSetupData, StartOpenrouterSetupResponse, StartOpenrouterSetupResponses, StartTetrateSetupData, StartTetrateSetupResponse, StartTetrateSetupResponses, StartTunnelData, StartTunnelError, StartTunnelErrors, StartTunnelResponse, StartTunnelResponses, StatusData, StatusResponse, StatusResponses, StopAgentData, StopAgentErrors, StopAgentRequest, StopAgentResponse, StopAgentResponses, StopTunnelData, StopTunnelError, StopTunnelErrors, StopTunnelResponses, 
SubRecipe, SuccessCheck, SystemInfo, SystemInfoData, SystemInfoResponse, SystemInfoResponses, SystemNotificationContent, SystemNotificationType, TelemetryEventRequest, Template, TextContent, ThinkingContent, TokenState, Tool, ToolAnnotations, ToolConfirmationRequest, ToolInfo, ToolPermission, ToolRequest, ToolResponse, TranscribeDictationData, TranscribeDictationErrors, TranscribeDictationResponse, TranscribeDictationResponses, TranscribeRequest, TranscribeResponse, TunnelInfo, TunnelState, UiMetadata, UnpauseScheduleData, UnpauseScheduleErrors, UnpauseScheduleResponse, UnpauseScheduleResponses, UpdateAgentProviderData, UpdateAgentProviderErrors, UpdateAgentProviderResponses, UpdateCustomProviderData, UpdateCustomProviderErrors, UpdateCustomProviderRequest, UpdateCustomProviderResponse, UpdateCustomProviderResponses, UpdateFromSessionData, UpdateFromSessionErrors, UpdateFromSessionRequest, UpdateFromSessionResponses, UpdateProviderRequest, UpdateScheduleData, UpdateScheduleErrors, UpdateScheduleRequest, UpdateScheduleResponse, UpdateScheduleResponses, UpdateSessionNameData, UpdateSessionNameErrors, UpdateSessionNameRequest, UpdateSessionNameResponses, UpdateSessionUserRecipeValuesData, UpdateSessionUserRecipeValuesError, UpdateSessionUserRecipeValuesErrors, UpdateSessionUserRecipeValuesRequest, UpdateSessionUserRecipeValuesResponse, UpdateSessionUserRecipeValuesResponse2, UpdateSessionUserRecipeValuesResponses, UpdateWorkingDirData, UpdateWorkingDirErrors, UpdateWorkingDirRequest, UpdateWorkingDirResponses, UpsertConfigData, UpsertConfigErrors, UpsertConfigQuery, UpsertConfigResponse, UpsertConfigResponses, UpsertPermissionsData, UpsertPermissionsErrors, UpsertPermissionsQuery, UpsertPermissionsResponse, UpsertPermissionsResponses, ValidateConfigData, ValidateConfigErrors, ValidateConfigResponse, ValidateConfigResponses, WindowProps } from './types.gen'; diff --git a/ui/desktop/src/api/sdk.gen.ts b/ui/desktop/src/api/sdk.gen.ts index 9ddecc308241..53af9618e058 100644 --- a/ui/desktop/src/api/sdk.gen.ts +++ b/ui/desktop/src/api/sdk.gen.ts @@ -2,7 +2,7 @@ import type { Client, Options as Options2, TDataShape } from './client'; import { client } from './client.gen'; -import type { AddExtensionData, AddExtensionErrors, AddExtensionResponses, AgentAddExtensionData, AgentAddExtensionErrors, AgentAddExtensionResponses, AgentRemoveExtensionData, AgentRemoveExtensionErrors, AgentRemoveExtensionResponses, BackupConfigData, BackupConfigErrors, BackupConfigResponses, CallToolData, CallToolErrors, CallToolResponses, CheckProviderData, ConfigureProviderOauthData, ConfigureProviderOauthErrors, ConfigureProviderOauthResponses, ConfirmToolActionData, ConfirmToolActionErrors, ConfirmToolActionResponses, CreateCustomProviderData, CreateCustomProviderErrors, CreateCustomProviderResponses, CreateRecipeData, CreateRecipeErrors, CreateRecipeResponses, CreateScheduleData, CreateScheduleErrors, CreateScheduleResponses, DecodeRecipeData, DecodeRecipeErrors, DecodeRecipeResponses, DeleteRecipeData, DeleteRecipeErrors, DeleteRecipeResponses, DeleteScheduleData, DeleteScheduleErrors, DeleteScheduleResponses, DeleteSessionData, DeleteSessionErrors, DeleteSessionResponses, DetectProviderData, DetectProviderErrors, DetectProviderResponses, DiagnosticsData, DiagnosticsErrors, DiagnosticsResponses, EncodeRecipeData, EncodeRecipeErrors, EncodeRecipeResponses, ExportAppData, ExportAppErrors, ExportAppResponses, ExportSessionData, ExportSessionErrors, ExportSessionResponses, ForkSessionData, ForkSessionErrors, 
ForkSessionResponses, GetCustomProviderData, GetCustomProviderErrors, GetCustomProviderResponses, GetExtensionsData, GetExtensionsErrors, GetExtensionsResponses, GetPricingData, GetPricingResponses, GetPromptData, GetPromptErrors, GetPromptResponses, GetPromptsData, GetPromptsResponses, GetProviderModelsData, GetProviderModelsErrors, GetProviderModelsResponses, GetSessionData, GetSessionErrors, GetSessionExtensionsData, GetSessionExtensionsErrors, GetSessionExtensionsResponses, GetSessionInsightsData, GetSessionInsightsErrors, GetSessionInsightsResponses, GetSessionResponses, GetSlashCommandsData, GetSlashCommandsResponses, GetToolsData, GetToolsErrors, GetToolsResponses, GetTunnelStatusData, GetTunnelStatusResponses, ImportAppData, ImportAppErrors, ImportAppResponses, ImportSessionData, ImportSessionErrors, ImportSessionResponses, InitConfigData, InitConfigErrors, InitConfigResponses, InspectRunningJobData, InspectRunningJobErrors, InspectRunningJobResponses, KillRunningJobData, KillRunningJobResponses, ListAppsData, ListAppsErrors, ListAppsResponses, ListRecipesData, ListRecipesErrors, ListRecipesResponses, ListSchedulesData, ListSchedulesErrors, ListSchedulesResponses, ListSessionsData, ListSessionsErrors, ListSessionsResponses, McpUiProxyData, McpUiProxyErrors, McpUiProxyResponses, ParseRecipeData, ParseRecipeErrors, ParseRecipeResponses, PauseScheduleData, PauseScheduleErrors, PauseScheduleResponses, ProvidersData, ProvidersResponses, ReadAllConfigData, ReadAllConfigResponses, ReadConfigData, ReadConfigErrors, ReadConfigResponses, ReadResourceData, ReadResourceErrors, ReadResourceResponses, RecipeToYamlData, RecipeToYamlErrors, RecipeToYamlResponses, RecoverConfigData, RecoverConfigErrors, RecoverConfigResponses, RemoveConfigData, RemoveConfigErrors, RemoveConfigResponses, RemoveCustomProviderData, RemoveCustomProviderErrors, RemoveCustomProviderResponses, RemoveExtensionData, RemoveExtensionErrors, RemoveExtensionResponses, ReplyData, ReplyErrors, ReplyResponses, ResetPromptData, ResetPromptErrors, ResetPromptResponses, RestartAgentData, RestartAgentErrors, RestartAgentResponses, ResumeAgentData, ResumeAgentErrors, ResumeAgentResponses, RunNowHandlerData, RunNowHandlerErrors, RunNowHandlerResponses, SavePromptData, SavePromptErrors, SavePromptResponses, SaveRecipeData, SaveRecipeErrors, SaveRecipeResponses, ScanRecipeData, ScanRecipeResponses, ScheduleRecipeData, ScheduleRecipeErrors, ScheduleRecipeResponses, SendTelemetryEventData, SendTelemetryEventResponses, SessionsHandlerData, SessionsHandlerErrors, SessionsHandlerResponses, SetConfigProviderData, SetRecipeSlashCommandData, SetRecipeSlashCommandErrors, SetRecipeSlashCommandResponses, StartAgentData, StartAgentErrors, StartAgentResponses, StartOpenrouterSetupData, StartOpenrouterSetupResponses, StartTetrateSetupData, StartTetrateSetupResponses, StartTunnelData, StartTunnelErrors, StartTunnelResponses, StatusData, StatusResponses, StopAgentData, StopAgentErrors, StopAgentResponses, StopTunnelData, StopTunnelErrors, StopTunnelResponses, SystemInfoData, SystemInfoResponses, UnpauseScheduleData, UnpauseScheduleErrors, UnpauseScheduleResponses, UpdateAgentProviderData, UpdateAgentProviderErrors, UpdateAgentProviderResponses, UpdateCustomProviderData, UpdateCustomProviderErrors, UpdateCustomProviderResponses, UpdateFromSessionData, UpdateFromSessionErrors, UpdateFromSessionResponses, UpdateScheduleData, UpdateScheduleErrors, UpdateScheduleResponses, UpdateSessionNameData, UpdateSessionNameErrors, UpdateSessionNameResponses, 
UpdateSessionUserRecipeValuesData, UpdateSessionUserRecipeValuesErrors, UpdateSessionUserRecipeValuesResponses, UpdateWorkingDirData, UpdateWorkingDirErrors, UpdateWorkingDirResponses, UpsertConfigData, UpsertConfigErrors, UpsertConfigResponses, UpsertPermissionsData, UpsertPermissionsErrors, UpsertPermissionsResponses, ValidateConfigData, ValidateConfigErrors, ValidateConfigResponses } from './types.gen'; +import type { AddExtensionData, AddExtensionErrors, AddExtensionResponses, AgentAddExtensionData, AgentAddExtensionErrors, AgentAddExtensionResponses, AgentRemoveExtensionData, AgentRemoveExtensionErrors, AgentRemoveExtensionResponses, BackupConfigData, BackupConfigErrors, BackupConfigResponses, CallToolData, CallToolErrors, CallToolResponses, CheckProviderData, ConfigureProviderOauthData, ConfigureProviderOauthErrors, ConfigureProviderOauthResponses, ConfirmToolActionData, ConfirmToolActionErrors, ConfirmToolActionResponses, CreateCustomProviderData, CreateCustomProviderErrors, CreateCustomProviderResponses, CreateRecipeData, CreateRecipeErrors, CreateRecipeResponses, CreateScheduleData, CreateScheduleErrors, CreateScheduleResponses, DecodeRecipeData, DecodeRecipeErrors, DecodeRecipeResponses, DeleteRecipeData, DeleteRecipeErrors, DeleteRecipeResponses, DeleteScheduleData, DeleteScheduleErrors, DeleteScheduleResponses, DeleteSessionData, DeleteSessionErrors, DeleteSessionResponses, DetectProviderData, DetectProviderErrors, DetectProviderResponses, DiagnosticsData, DiagnosticsErrors, DiagnosticsResponses, EncodeRecipeData, EncodeRecipeErrors, EncodeRecipeResponses, ExportAppData, ExportAppErrors, ExportAppResponses, ExportSessionData, ExportSessionErrors, ExportSessionResponses, ForkSessionData, ForkSessionErrors, ForkSessionResponses, GetCustomProviderData, GetCustomProviderErrors, GetCustomProviderResponses, GetDictationConfigData, GetDictationConfigResponses, GetExtensionsData, GetExtensionsErrors, GetExtensionsResponses, GetPricingData, GetPricingResponses, GetPromptData, GetPromptErrors, GetPromptResponses, GetPromptsData, GetPromptsResponses, GetProviderModelsData, GetProviderModelsErrors, GetProviderModelsResponses, GetSessionData, GetSessionErrors, GetSessionExtensionsData, GetSessionExtensionsErrors, GetSessionExtensionsResponses, GetSessionInsightsData, GetSessionInsightsErrors, GetSessionInsightsResponses, GetSessionResponses, GetSlashCommandsData, GetSlashCommandsResponses, GetToolsData, GetToolsErrors, GetToolsResponses, GetTunnelStatusData, GetTunnelStatusResponses, ImportAppData, ImportAppErrors, ImportAppResponses, ImportSessionData, ImportSessionErrors, ImportSessionResponses, InitConfigData, InitConfigErrors, InitConfigResponses, InspectRunningJobData, InspectRunningJobErrors, InspectRunningJobResponses, KillRunningJobData, KillRunningJobResponses, ListAppsData, ListAppsErrors, ListAppsResponses, ListRecipesData, ListRecipesErrors, ListRecipesResponses, ListSchedulesData, ListSchedulesErrors, ListSchedulesResponses, ListSessionsData, ListSessionsErrors, ListSessionsResponses, McpUiProxyData, McpUiProxyErrors, McpUiProxyResponses, ParseRecipeData, ParseRecipeErrors, ParseRecipeResponses, PauseScheduleData, PauseScheduleErrors, PauseScheduleResponses, ProvidersData, ProvidersResponses, ReadAllConfigData, ReadAllConfigResponses, ReadConfigData, ReadConfigErrors, ReadConfigResponses, ReadResourceData, ReadResourceErrors, ReadResourceResponses, RecipeToYamlData, RecipeToYamlErrors, RecipeToYamlResponses, RecoverConfigData, RecoverConfigErrors, RecoverConfigResponses, 
RemoveConfigData, RemoveConfigErrors, RemoveConfigResponses, RemoveCustomProviderData, RemoveCustomProviderErrors, RemoveCustomProviderResponses, RemoveExtensionData, RemoveExtensionErrors, RemoveExtensionResponses, ReplyData, ReplyErrors, ReplyResponses, ResetPromptData, ResetPromptErrors, ResetPromptResponses, RestartAgentData, RestartAgentErrors, RestartAgentResponses, ResumeAgentData, ResumeAgentErrors, ResumeAgentResponses, RunNowHandlerData, RunNowHandlerErrors, RunNowHandlerResponses, SavePromptData, SavePromptErrors, SavePromptResponses, SaveRecipeData, SaveRecipeErrors, SaveRecipeResponses, ScanRecipeData, ScanRecipeResponses, ScheduleRecipeData, ScheduleRecipeErrors, ScheduleRecipeResponses, SendTelemetryEventData, SendTelemetryEventResponses, SessionsHandlerData, SessionsHandlerErrors, SessionsHandlerResponses, SetConfigProviderData, SetRecipeSlashCommandData, SetRecipeSlashCommandErrors, SetRecipeSlashCommandResponses, StartAgentData, StartAgentErrors, StartAgentResponses, StartOpenrouterSetupData, StartOpenrouterSetupResponses, StartTetrateSetupData, StartTetrateSetupResponses, StartTunnelData, StartTunnelErrors, StartTunnelResponses, StatusData, StatusResponses, StopAgentData, StopAgentErrors, StopAgentResponses, StopTunnelData, StopTunnelErrors, StopTunnelResponses, SystemInfoData, SystemInfoResponses, TranscribeDictationData, TranscribeDictationErrors, TranscribeDictationResponses, UnpauseScheduleData, UnpauseScheduleErrors, UnpauseScheduleResponses, UpdateAgentProviderData, UpdateAgentProviderErrors, UpdateAgentProviderResponses, UpdateCustomProviderData, UpdateCustomProviderErrors, UpdateCustomProviderResponses, UpdateFromSessionData, UpdateFromSessionErrors, UpdateFromSessionResponses, UpdateScheduleData, UpdateScheduleErrors, UpdateScheduleResponses, UpdateSessionNameData, UpdateSessionNameErrors, UpdateSessionNameResponses, UpdateSessionUserRecipeValuesData, UpdateSessionUserRecipeValuesErrors, UpdateSessionUserRecipeValuesResponses, UpdateWorkingDirData, UpdateWorkingDirErrors, UpdateWorkingDirResponses, UpsertConfigData, UpsertConfigErrors, UpsertConfigResponses, UpsertPermissionsData, UpsertPermissionsErrors, UpsertPermissionsResponses, ValidateConfigData, ValidateConfigErrors, ValidateConfigResponses } from './types.gen'; export type Options = Options2 & { /** @@ -283,6 +283,17 @@ export const validateConfig = (options?: O export const diagnostics = (options: Options) => (options.client ?? client).get({ url: '/diagnostics/{session_id}', ...options }); +export const getDictationConfig = (options?: Options) => (options?.client ?? client).get({ url: '/dictation/config', ...options }); + +export const transcribeDictation = (options: Options) => (options.client ?? client).post({ + url: '/dictation/transcribe', + ...options, + headers: { + 'Content-Type': 'application/json', + ...options.headers + } +}); + export const startOpenrouterSetup = (options?: Options) => (options?.client ?? client).post({ url: '/handle_openrouter', ...options }); export const startTetrateSetup = (options?: Options) => (options?.client ?? 
client).post({ url: '/handle_tetrate', ...options }); diff --git a/ui/desktop/src/api/types.gen.ts b/ui/desktop/src/api/types.gen.ts index 2bc51d6010af..08ff21ec1319 100644 --- a/ui/desktop/src/api/types.gen.ts +++ b/ui/desktop/src/api/types.gen.ts @@ -190,6 +190,35 @@ export type DetectProviderResponse = { provider_name: string; }; +export type DictationProvider = 'openai' | 'elevenlabs'; + +export type DictationProviderStatus = { + /** + * Config key name if uses_provider_config is false + */ + config_key?: string | null; + /** + * Whether the provider is fully configured and ready to use + */ + configured: boolean; + /** + * Description of what this provider does + */ + description: string; + /** + * Custom host URL if configured (only for providers that support it) + */ + host?: string | null; + /** + * Path to settings if uses_provider_config is true + */ + settings_path?: string | null; + /** + * Whether this provider uses the main provider config (true) or has its own key (false) + */ + uses_provider_config: boolean; +}; + export type EmbeddedResource = { _meta?: { [key: string]: unknown; @@ -1173,6 +1202,25 @@ export type ToolResponse = { }; }; +export type TranscribeRequest = { + /** + * Base64 encoded audio data + */ + audio: string; + /** + * MIME type of the audio (e.g., "audio/webm", "audio/wav") + */ + mime_type: string; + provider: DictationProvider; +}; + +export type TranscribeResponse = { + /** + * Transcribed text from the audio + */ + text: string; +}; + export type TunnelInfo = { hostname: string; secret: string; @@ -2456,6 +2504,79 @@ export type DiagnosticsResponses = { export type DiagnosticsResponse = DiagnosticsResponses[keyof DiagnosticsResponses]; +export type GetDictationConfigData = { + body?: never; + path?: never; + query?: never; + url: '/dictation/config'; +}; + +export type GetDictationConfigResponses = { + /** + * Audio transcription provider configurations + */ + 200: { + [key: string]: DictationProviderStatus; + }; +}; + +export type GetDictationConfigResponse = GetDictationConfigResponses[keyof GetDictationConfigResponses]; + +export type TranscribeDictationData = { + body: TranscribeRequest; + path?: never; + query?: never; + url: '/dictation/transcribe'; +}; + +export type TranscribeDictationErrors = { + /** + * Invalid request (bad base64 or unsupported format) + */ + 400: unknown; + /** + * Invalid API key + */ + 401: unknown; + /** + * DictationProvider not configured + */ + 412: unknown; + /** + * Audio file too large (max 25MB) + */ + 413: unknown; + /** + * Rate limit exceeded + */ + 429: unknown; + /** + * Internal server error + */ + 500: unknown; + /** + * DictationProvider API error + */ + 502: unknown; + /** + * Service unavailable + */ + 503: unknown; + /** + * Request timeout + */ + 504: unknown; +}; + +export type TranscribeDictationResponses = { + /** + * Audio transcribed successfully + */ + 200: TranscribeResponse; +}; + +export type TranscribeDictationResponse = TranscribeDictationResponses[keyof TranscribeDictationResponses]; + export type StartOpenrouterSetupData = { body?: never; path?: never; diff --git a/ui/desktop/src/components/ChatInput.tsx b/ui/desktop/src/components/ChatInput.tsx index 638d362779dc..17bded35f421 100644 --- a/ui/desktop/src/components/ChatInput.tsx +++ b/ui/desktop/src/components/ChatInput.tsx @@ -16,12 +16,10 @@ import { BottomMenuExtensionSelection } from './bottom_menu/BottomMenuExtensionS import { AlertType, useAlerts } from './alerts'; import { useConfig } from './ConfigContext'; import { 
useModelAndProvider } from './ModelAndProviderContext'; -import { useWhisper } from '../hooks/useWhisper'; -import { DICTATION_PROVIDER_ELEVENLABS } from '../hooks/dictationConstants'; -import { WaveformVisualizer } from './WaveformVisualizer'; +import { useAudioRecorder } from '../hooks/useAudioRecorder'; import { toastError } from '../toasts'; import MentionPopover, { DisplayItemWithMatch } from './MentionPopover'; -import { COST_TRACKING_ENABLED, VOICE_DICTATION_ELEVENLABS_ENABLED } from '../updates'; +import { COST_TRACKING_ENABLED } from '../updates'; import { CostTracker } from './bottom_menu/CostTracker'; import { DroppedFile, useFileDrop } from '../hooks/useFileDrop'; import { Recipe } from '../recipe'; @@ -254,19 +252,17 @@ export default function ChatInput({ selectFile: (index: number) => void; }>(null); - // Whisper hook for voice dictation + // Audio recorder hook for voice dictation const { + isEnabled, + dictationProvider, isRecording, isTranscribing, - canUseDictation, - audioContext, - analyser, startRecording, stopRecording, recordingDuration, estimatedSize, - dictationSettings, - } = useWhisper({ + } = useAudioRecorder({ onTranscription: (text) => { trackVoiceDictation('transcribed'); // Append transcribed text to the current input @@ -275,18 +271,12 @@ export default function ChatInput({ setValue(newValue); textAreaRef.current?.focus(); }, - onError: (error) => { - const errorType = error.name || 'DictationError'; + onError: (message) => { + const errorType = 'DictationError'; trackVoiceDictation('error', undefined, errorType); toastError({ title: 'Dictation Error', - msg: error.message, - }); - }, - onSizeWarning: (sizeMB) => { - toastError({ - title: 'Recording Size Warning', - msg: `Recording is ${sizeMB.toFixed(1)}MB. Maximum size is 25MB.`, + msg: message, }); }, }); @@ -1236,26 +1226,25 @@ export default function ChatInput({ maxHeight: `${maxHeight}px`, overflowY: 'auto', opacity: isRecording ? 0 : 1, - paddingRight: dictationSettings?.enabled ? '180px' : '120px', + paddingRight: dictationProvider ? '180px' : '120px', }} className="w-full outline-none border-none focus:ring-0 bg-transparent px-3 pt-3 pb-1.5 text-sm resize-none text-textStandard placeholder:text-textPlaceholder" /> {isRecording && (
-            <WaveformVisualizer audioContext={audioContext} analyser={analyser} />
+            <div>
+              <span>Recording...</span>
+            </div>
           )}
           {/* Inline action buttons - absolutely positioned on the right */}
           <div>
-            {/* Microphone button - show only if dictation is enabled */}
-            {dictationSettings?.enabled && (
+            {/* Microphone button - show only if provider is selected */}
+            {dictationProvider && (
               <>
-                {!canUseDictation ? (
+                {!isEnabled ? (
@@ -1273,22 +1262,16 @@ export default function ChatInput({
-                      {dictationSettings.provider === 'openai' ? (
+                      {dictationProvider === 'openai' ? (
                         <div>
                           OpenAI API key is not configured. Set it up in Settings {'>'}{' '}
                           Models.
                         </div>
-                      ) : VOICE_DICTATION_ELEVENLABS_ENABLED &&
-                        dictationSettings.provider === DICTATION_PROVIDER_ELEVENLABS ? (
+                      ) : dictationProvider === 'elevenlabs' ? (
                         <div>
                           ElevenLabs API key is not configured. Set it up in Settings {'>'}{' '}
                           Chat {'>'} Voice Dictation.
                         </div>
-                      ) : dictationSettings.provider === null ? (
-                        <div>
-                          Dictation is not configured. Configure it in Settings {'>'}{' '}
-                          Chat {'>'} Voice Dictation.
-                        </div>
                       ) : (
                         <div>
                           Dictation provider is not properly configured.
                         </div>
                      )}
diff --git a/ui/desktop/src/components/settings/chat/ChatSettingsSection.tsx b/ui/desktop/src/components/settings/chat/ChatSettingsSection.tsx
index fc7ae5acd61e..ab630ecd96cf 100644
--- a/ui/desktop/src/components/settings/chat/ChatSettingsSection.tsx
+++ b/ui/desktop/src/components/settings/chat/ChatSettingsSection.tsx
@@ -1,5 +1,5 @@
 import { ModeSection } from '../mode/ModeSection';
-import DictationSection from '../dictation/DictationSection';
+import { DictationSettings } from '../dictation/DictationSettings';
 import { SecurityToggle } from '../security/SecurityToggle';
 import { ResponseStylesSection } from '../response_styles/ResponseStylesSection';
 import { GoosehintsSection } from './GoosehintsSection';
@@ -21,30 +21,30 @@ export default function ChatSettingsSection() {
-      <DictationSection />
-      <div>
-        <div>Response Styles</div>
-        <div>Choose how Goose should format and style its responses</div>
-      </div>
+      <DictationSettings />
+      <div>
+        <div>Response Styles</div>
+        <div>Choose how Goose should format and style its responses</div>
+      </div>
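Reviewer note: the two generated SDK calls added earlier in this diff (`getDictationConfig` and `transcribeDictation`) are the entire client surface for this feature. A minimal sketch of the intended call sequence, assuming an audio clip already encoded as base64 WebM; `audioBase64` and `transcribeClip` are illustrative names, not part of the diff:

```ts
import { getDictationConfig, transcribeDictation, DictationProvider } from './api';

// Pick the first configured provider reported by GET /dictation/config.
async function transcribeClip(audioBase64: string): Promise<string | undefined> {
  const config = await getDictationConfig();
  const statuses = config.data ?? {};
  const provider = (['openai', 'elevenlabs'] as DictationProvider[]).find(
    (p) => statuses[p]?.configured
  );
  if (!provider) return undefined; // no provider configured; the UI hides the mic button

  // POST /dictation/transcribe; throwOnError surfaces the documented 4xx/5xx
  // cases (412 unconfigured, 413 oversize, 429 rate limit, ...) as exceptions.
  const result = await transcribeDictation({
    body: { audio: audioBase64, mime_type: 'audio/webm', provider },
    throwOnError: true,
  });
  return result.data?.text;
}
```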
diff --git a/ui/desktop/src/components/settings/dictation/DictationSection.tsx b/ui/desktop/src/components/settings/dictation/DictationSection.tsx
deleted file mode 100644
index a4abcfcc087d..000000000000
--- a/ui/desktop/src/components/settings/dictation/DictationSection.tsx
+++ /dev/null
@@ -1,5 +0,0 @@
-import { VoiceDictationToggle } from './VoiceDictationToggle';
-
-export default function DictationSection() {
-  return <VoiceDictationToggle />;
-}
diff --git a/ui/desktop/src/components/settings/dictation/DictationSettings.tsx b/ui/desktop/src/components/settings/dictation/DictationSettings.tsx
new file mode 100644
index 000000000000..39774cca20da
--- /dev/null
+++ b/ui/desktop/src/components/settings/dictation/DictationSettings.tsx
@@ -0,0 +1,235 @@
+import { useState, useEffect } from 'react';
+import { ChevronDown } from 'lucide-react';
+import { DictationProvider, getDictationConfig, DictationProviderStatus } from '../../../api';
+import { useConfig } from '../../ConfigContext';
+import { Input } from '../../ui/input';
+import { Button } from '../../ui/button';
+import { trackSettingToggled } from '../../../utils/analytics';
+
+export const DictationSettings = () => {
+  const [provider, setProvider] = useState<DictationProvider | null>(null);
+  const [showProviderDropdown, setShowProviderDropdown] = useState(false);
+  const [providerStatuses, setProviderStatuses] = useState<Record<string, DictationProviderStatus>>(
+    {}
+  );
+  const [apiKey, setApiKey] = useState('');
+  const [isEditingKey, setIsEditingKey] = useState(false);
+  const [keyValidationError, setKeyValidationError] = useState('');
+  const { read, upsert, remove } = useConfig();
+
+  useEffect(() => {
+    const loadSettings = async () => {
+      const providerValue = await read('voice_dictation_provider', false);
+      const loadedProvider: DictationProvider | null = (providerValue as DictationProvider) || null;
+      setProvider(loadedProvider);
+
+      const audioConfig = await getDictationConfig();
+      setProviderStatuses(audioConfig.data || {});
+    };
+
+    loadSettings();
+  }, [read]);
+
+  const saveProvider = async (newProvider: DictationProvider | null) => {
+    console.log('Saving dictation provider to backend config:', newProvider);
+    setProvider(newProvider);
+    await upsert('voice_dictation_provider', newProvider || '', false);
+    trackSettingToggled('voice_dictation', newProvider !== null);
+  };
+
+  const handleProviderChange = (newProvider: DictationProvider | null) => {
+    saveProvider(newProvider);
+    setShowProviderDropdown(false);
+  };
+
+  const handleDropdownToggle = async () => {
+    const newShowState = !showProviderDropdown;
+    setShowProviderDropdown(newShowState);
+
+    if (newShowState) {
+      const audioConfig = await getDictationConfig();
+      setProviderStatuses(audioConfig.data || {});
+    }
+  };
+
+  const handleSaveKey = async () => {
+    if (!provider) return;
+    const providerConfig = providerStatuses[provider];
+    if (!providerConfig || providerConfig.uses_provider_config) return;
+
+    const trimmedKey = apiKey.trim();
+    if (!trimmedKey) {
+      setKeyValidationError('API key is required');
+      return;
+    }
+
+    try {
+      const keyName = providerConfig.config_key!;
+      await upsert(keyName, trimmedKey, true);
+      setApiKey('');
+      setKeyValidationError('');
+      setIsEditingKey(false);
+
+      const audioConfig = await getDictationConfig();
+      setProviderStatuses(audioConfig.data || {});
+    } catch (error) {
+      console.error('Error saving API key:', error);
+      setKeyValidationError('Failed to save API key');
+    }
+  };
+
+  const handleRemoveKey = async () => {
+    if (!provider) return;
+    const providerConfig = providerStatuses[provider];
+    if (!providerConfig || providerConfig.uses_provider_config) return;
+
+    try {
+      const keyName = providerConfig.config_key!;
+      await remove(keyName, true);
+      setApiKey('');
+      setKeyValidationError('');
+      setIsEditingKey(false);
+
+      const audioConfig = await getDictationConfig();
+      setProviderStatuses(audioConfig.data || {});
+    } catch (error) {
+      console.error('Error removing API key:', error);
+      setKeyValidationError('Failed to remove API key');
+    }
+  };
+
+  const handleCancelEdit = () => {
+    setApiKey('');
+    setKeyValidationError('');
+    setIsEditingKey(false);
+  };
+
+  const getProviderLabel = (provider: DictationProvider | null): string => {
+    if (!provider) return 'Disabled';
+    return provider.charAt(0).toUpperCase() + provider.slice(1);
+  };
+
+  return (
+    <div>
+      <div>
+        <div>
+          <div>Voice Dictation Provider</div>
+          <div>Choose how voice is converted to text</div>
+        </div>
+        <div>
+          <Button onClick={handleDropdownToggle}>
+            {getProviderLabel(provider)}
+            <ChevronDown />
+          </Button>
+          {showProviderDropdown && (
+            <div>
+              <button onClick={() => handleProviderChange(null)}>Disabled</button>
+              {(Object.keys(providerStatuses) as DictationProvider[]).map((p) => (
+                <button key={p} onClick={() => handleProviderChange(p)}>
+                  {getProviderLabel(p)}
+                </button>
+              ))}
+            </div>
+          )}
+        </div>
+      </div>
+
+      {provider && providerStatuses[provider] && (
+        <>
+          <div>{providerStatuses[provider].description}</div>
+
+          {providerStatuses[provider].uses_provider_config ? (
+            <div>
+              {!providerStatuses[provider].configured ? (
+                <div>Configure the API key in {providerStatuses[provider].settings_path}</div>
+              ) : (
+                <div>✓ Configured in {providerStatuses[provider].settings_path}</div>
+              )}
+            </div>
+          ) : (
+            <div>
+              <div>
+                <div>API Key</div>
+                <div>
+                  Required for transcription
+                  {providerStatuses[provider]?.configured && <span>(Configured)</span>}
+                </div>
+              </div>
+              {!isEditingKey ? (
+                <Button onClick={() => setIsEditingKey(true)}>Set API Key</Button>
+              ) : (
+                <div>
+                  <Input
+                    type="password"
+                    value={apiKey}
+                    onChange={(e) => {
+                      setApiKey(e.target.value);
+                      if (keyValidationError) setKeyValidationError('');
+                    }}
+                    placeholder="Enter your API key"
+                    className="max-w-md"
+                    autoFocus
+                  />
+                  {keyValidationError && <div>{keyValidationError}</div>}
+                  <div>
+                    <Button onClick={handleSaveKey}>Save</Button>
+                    <Button onClick={handleCancelEdit}>Cancel</Button>
+                    {providerStatuses[provider]?.configured && (
+                      <Button onClick={handleRemoveKey}>Remove</Button>
+                    )}
+                  </div>
+                </div>
+              )}
+            </div>
+          )}
+        </>
+      )}
+    </div>
+ ); +}; diff --git a/ui/desktop/src/components/settings/dictation/ElevenLabsKeyInput.tsx b/ui/desktop/src/components/settings/dictation/ElevenLabsKeyInput.tsx deleted file mode 100644 index 1856a8c5e5c5..000000000000 --- a/ui/desktop/src/components/settings/dictation/ElevenLabsKeyInput.tsx +++ /dev/null @@ -1,128 +0,0 @@ -import { useState, useEffect, useCallback } from 'react'; -import { Input } from '../../ui/input'; -import { Button } from '../../ui/button'; -import { useConfig } from '../../ConfigContext'; -import { ELEVENLABS_API_KEY, isSecretKeyConfigured } from '../../../hooks/dictationConstants'; -import { setElevenLabsKeyCache } from '../../../hooks/useDictationSettings'; - -export const ElevenLabsKeyInput = () => { - const [elevenLabsApiKey, setElevenLabsApiKey] = useState(''); - const [isLoadingKey, setIsLoadingKey] = useState(false); - const [hasElevenLabsKey, setHasElevenLabsKey] = useState(false); - const [validationError, setValidationError] = useState(''); - const [isEditing, setIsEditing] = useState(false); - const { upsert, read, remove } = useConfig(); - - const loadKey = useCallback(async () => { - setIsLoadingKey(true); - try { - const response = await read(ELEVENLABS_API_KEY, true); - const hasKey = isSecretKeyConfigured(response); - setHasElevenLabsKey(hasKey); - setElevenLabsKeyCache(hasKey); - } catch (error) { - console.error(error); - setElevenLabsKeyCache(false); - } finally { - setIsLoadingKey(false); - } - }, [read]); - - useEffect(() => { - loadKey(); - }, [loadKey]); - - const handleElevenLabsKeyChange = (key: string) => { - setElevenLabsApiKey(key); - if (validationError) { - setValidationError(''); - } - }; - - const handleSave = async () => { - try { - const trimmedKey = elevenLabsApiKey.trim(); - - if (!trimmedKey) { - setValidationError('API key is required'); - return; - } - - await upsert(ELEVENLABS_API_KEY, trimmedKey, true); - setElevenLabsApiKey(''); - setValidationError(''); - setIsEditing(false); - await loadKey(); - } catch (error) { - console.error(error); - setValidationError('Failed to save API key'); - } - }; - - const handleRemove = async () => { - try { - await remove(ELEVENLABS_API_KEY, true); - await loadKey(); - setElevenLabsApiKey(''); - setValidationError(''); - setIsEditing(false); - } catch (error) { - console.error(error); - setValidationError('Failed to remove API key'); - } - }; - - const handleCancel = () => { - setElevenLabsApiKey(''); - setValidationError(''); - setIsEditing(false); - }; - - return ( -
-
-

ElevenLabs API Key

-

- Required for ElevenLabs voice recognition - {hasElevenLabsKey && (Configured)} -

-
- - {!isEditing ? ( - - ) : ( -
- handleElevenLabsKeyChange(e.target.value)} - placeholder="Enter your ElevenLabs API key" - className="max-w-md" - autoFocus - /> - {validationError &&

{validationError}

} -
- - - {hasElevenLabsKey && ( - - )} -
-
- )} -
- ); -}; diff --git a/ui/desktop/src/components/settings/dictation/ProviderInfo.tsx b/ui/desktop/src/components/settings/dictation/ProviderInfo.tsx deleted file mode 100644 index 187f790d1f2f..000000000000 --- a/ui/desktop/src/components/settings/dictation/ProviderInfo.tsx +++ /dev/null @@ -1,41 +0,0 @@ -import { DictationProvider } from '../../../hooks/useDictationSettings'; -import { DICTATION_PROVIDER_ELEVENLABS } from '../../../hooks/dictationConstants'; -import { VOICE_DICTATION_ELEVENLABS_ENABLED } from '../../../updates'; - -interface ProviderInfoProps { - provider: DictationProvider; -} - -export const ProviderInfo = ({ provider }: ProviderInfoProps) => { - if (!provider) return null; - - return ( -
- {provider === 'openai' && ( -

- Uses OpenAI's Whisper API for high-quality transcription. Requires an OpenAI API key - configured in the Models section. -

- )} - {VOICE_DICTATION_ELEVENLABS_ENABLED && provider === DICTATION_PROVIDER_ELEVENLABS && ( -
-

- Uses ElevenLabs speech-to-text API for high-quality transcription. -

-

- Features: -

-
-          <ul>
-            <li>• Advanced voice processing</li>
-            <li>• High accuracy transcription</li>
-            <li>• Multiple language support</li>
-            <li>• Fast processing</li>
-          </ul>

- Note: Requires an ElevenLabs API key with speech-to-text access. -

-
- )} -
- ); -}; diff --git a/ui/desktop/src/components/settings/dictation/ProviderSelector.tsx b/ui/desktop/src/components/settings/dictation/ProviderSelector.tsx deleted file mode 100644 index 4b3878509a06..000000000000 --- a/ui/desktop/src/components/settings/dictation/ProviderSelector.tsx +++ /dev/null @@ -1,128 +0,0 @@ -import { useState, useEffect } from 'react'; -import { ChevronDown } from 'lucide-react'; -import { DictationProvider, DictationSettings } from '../../../hooks/useDictationSettings'; -import { - DICTATION_PROVIDER_OPENAI, - DICTATION_PROVIDER_ELEVENLABS, -} from '../../../hooks/dictationConstants'; -import { useConfig } from '../../ConfigContext'; -import { ElevenLabsKeyInput } from './ElevenLabsKeyInput'; -import { ProviderInfo } from './ProviderInfo'; -import { VOICE_DICTATION_ELEVENLABS_ENABLED } from '../../../updates'; - -interface ProviderSelectorProps { - settings: DictationSettings; - onProviderChange: (provider: DictationProvider) => void; -} - -export const ProviderSelector = ({ settings, onProviderChange }: ProviderSelectorProps) => { - const [hasOpenAIKey, setHasOpenAIKey] = useState(false); - const [showProviderDropdown, setShowProviderDropdown] = useState(false); - const { getProviders } = useConfig(); - - useEffect(() => { - const checkOpenAIKey = async () => { - try { - const providers = await getProviders(false); - const openAIProvider = providers.find((p) => p.name === 'openai'); - setHasOpenAIKey(openAIProvider?.is_configured || false); - } catch (error) { - console.error('Error checking OpenAI configuration:', error); - setHasOpenAIKey(false); - } - }; - - checkOpenAIKey(); - }, [getProviders]); - - const handleDropdownToggle = async () => { - const newShowState = !showProviderDropdown; - setShowProviderDropdown(newShowState); - - if (newShowState) { - try { - const providers = await getProviders(true); - const openAIProvider = providers.find((p) => p.name === 'openai'); - const isConfigured = !!openAIProvider?.is_configured; - setHasOpenAIKey(isConfigured); - } catch (error) { - console.error('Error checking OpenAI configuration:', error); - setHasOpenAIKey(false); - } - } - }; - - const handleProviderChange = (provider: DictationProvider) => { - onProviderChange(provider); - setShowProviderDropdown(false); - }; - - const getProviderLabel = (provider: DictationProvider): string => { - switch (provider) { - case DICTATION_PROVIDER_OPENAI: - return 'OpenAI Whisper'; - case DICTATION_PROVIDER_ELEVENLABS: - return 'ElevenLabs'; - default: - return 'None (disabled)'; - } - }; - - return ( -
-
-
-

Dictation Provider

-

- Choose how voice is converted to text -

-
-
- - - {showProviderDropdown && ( -
- - - {VOICE_DICTATION_ELEVENLABS_ENABLED && ( - - )} -
- )} -
-
- - {VOICE_DICTATION_ELEVENLABS_ENABLED && - settings.provider === DICTATION_PROVIDER_ELEVENLABS && } - - -
- ); -}; diff --git a/ui/desktop/src/components/settings/dictation/VoiceDictationToggle.tsx b/ui/desktop/src/components/settings/dictation/VoiceDictationToggle.tsx deleted file mode 100644 index 6af1c71c8916..000000000000 --- a/ui/desktop/src/components/settings/dictation/VoiceDictationToggle.tsx +++ /dev/null @@ -1,97 +0,0 @@ -import { useState, useEffect } from 'react'; -import { Switch } from '../../ui/switch'; -import { DictationProvider, DictationSettings } from '../../../hooks/useDictationSettings'; -import { - DICTATION_SETTINGS_KEY, - DICTATION_PROVIDER_OPENAI, - DICTATION_PROVIDER_ELEVENLABS, - getDefaultDictationSettings, -} from '../../../hooks/dictationConstants'; -import { useConfig } from '../../ConfigContext'; -import { ProviderSelector } from './ProviderSelector'; -import { VOICE_DICTATION_ELEVENLABS_ENABLED } from '../../../updates'; -import { trackSettingToggled } from '../../../utils/analytics'; - -export const VoiceDictationToggle = () => { - const [settings, setSettings] = useState({ - enabled: false, - provider: null, - }); - const { getProviders } = useConfig(); - - useEffect(() => { - const loadSettings = async () => { - const savedSettings = localStorage.getItem(DICTATION_SETTINGS_KEY); - - let loadedSettings: DictationSettings; - - if (savedSettings) { - const parsed = JSON.parse(savedSettings); - loadedSettings = parsed; - - // If ElevenLabs is disabled and user has it selected, reset to OpenAI - if ( - !VOICE_DICTATION_ELEVENLABS_ENABLED && - loadedSettings.provider === DICTATION_PROVIDER_ELEVENLABS - ) { - loadedSettings = { - ...loadedSettings, - provider: DICTATION_PROVIDER_OPENAI, - }; - localStorage.setItem(DICTATION_SETTINGS_KEY, JSON.stringify(loadedSettings)); - } - } else { - loadedSettings = await getDefaultDictationSettings(getProviders); - } - - setSettings(loadedSettings); - }; - - loadSettings(); - }, [getProviders]); - - const saveSettings = (newSettings: DictationSettings) => { - console.log('Saving dictation settings to localStorage:', newSettings); - setSettings(newSettings); - localStorage.setItem(DICTATION_SETTINGS_KEY, JSON.stringify(newSettings)); - }; - - const handleToggle = (enabled: boolean) => { - saveSettings({ - ...settings, - enabled, - provider: settings.provider === null ? DICTATION_PROVIDER_OPENAI : settings.provider, - }); - trackSettingToggled('voice_dictation', enabled); - }; - - const handleProviderChange = (provider: DictationProvider) => { - saveSettings({ ...settings, provider }); - }; - - return ( -
-
-
-

Enable Voice Dictation

-

- Show microphone button for voice input -

-
-
- -
-
- -
-
- -
-
-
- ); -}; diff --git a/ui/desktop/src/hooks/dictationConstants.ts b/ui/desktop/src/hooks/dictationConstants.ts deleted file mode 100644 index 1863c6decace..000000000000 --- a/ui/desktop/src/hooks/dictationConstants.ts +++ /dev/null @@ -1,31 +0,0 @@ -import { DictationSettings, DictationProvider } from './useDictationSettings'; - -export const DICTATION_SETTINGS_KEY = 'dictation_settings'; -export const ELEVENLABS_API_KEY = 'ELEVENLABS_API_KEY'; -export const DICTATION_PROVIDER_OPENAI = 'openai' as const; -export const DICTATION_PROVIDER_ELEVENLABS = 'elevenlabs' as const; - -export const isSecretKeyConfigured = (response: unknown): boolean => - typeof response === 'object' && - response !== null && - 'maskedValue' in response && - !!(response as { maskedValue: string }).maskedValue; - -export const getDefaultDictationSettings = async ( - getProviders: (refresh: boolean) => Promise> -): Promise => { - const providers = await getProviders(false); - const openAIProvider = providers.find((p) => p.name === 'openai'); - - if (openAIProvider && openAIProvider.is_configured) { - return { - enabled: true, - provider: DICTATION_PROVIDER_OPENAI, - }; - } else { - return { - enabled: false, - provider: null as DictationProvider, - }; - } -}; diff --git a/ui/desktop/src/hooks/useAudioRecorder.ts b/ui/desktop/src/hooks/useAudioRecorder.ts new file mode 100644 index 000000000000..cfdc43f20678 --- /dev/null +++ b/ui/desktop/src/hooks/useAudioRecorder.ts @@ -0,0 +1,219 @@ +import { useState, useRef, useCallback, useEffect } from 'react'; +import { transcribeDictation, getDictationConfig, DictationProvider } from '../api'; +import { useConfig } from '../components/ConfigContext'; +import { errorMessage } from '../utils/conversionUtils'; + +interface UseAudioRecorderOptions { + onTranscription: (text: string) => void; + onError: (message: string) => void; +} + +const MAX_AUDIO_SIZE_MB = 25; +const MAX_RECORDING_DURATION_SECONDS = 10 * 60; + +export const useAudioRecorder = ({ onTranscription, onError }: UseAudioRecorderOptions) => { + const [isRecording, setIsRecording] = useState(false); + const [isTranscribing, setIsTranscribing] = useState(false); + const [recordingDuration, setRecordingDuration] = useState(0); + const [estimatedSize, setEstimatedSize] = useState(0); + const [isEnabled, setIsEnabled] = useState(false); + const [provider, setProvider] = useState(null); + + const { read } = useConfig(); + + const mediaRecorderRef = useRef(null); + const audioChunksRef = useRef([]); + const streamRef = useRef(null); + const durationIntervalRef = useRef | null>(null); + + useEffect(() => { + const checkProviderConfig = async () => { + try { + const providerValue = await read('voice_dictation_provider', false); + const preferredProvider = (providerValue as DictationProvider) || null; + + if (!preferredProvider) { + setIsEnabled(false); + setProvider(null); + return; + } + + const audioConfigResponse = await getDictationConfig(); + const providerStatus = audioConfigResponse.data?.[preferredProvider]; + + setIsEnabled(!!providerStatus?.configured); + setProvider(preferredProvider); + } catch (error) { + console.error('Error checking audio config:', error); + setIsEnabled(false); + setProvider(null); + } + }; + + checkProviderConfig(); + }, [read]); + + const stopRecording = useCallback(() => { + setIsRecording(false); + + if (mediaRecorderRef.current && mediaRecorderRef.current.state !== 'inactive') { + mediaRecorderRef.current.stop(); + } + + if (durationIntervalRef.current) { + 
clearInterval(durationIntervalRef.current); + durationIntervalRef.current = null; + } + + if (streamRef.current) { + streamRef.current.getTracks().forEach((track) => track.stop()); + streamRef.current = null; + } + }, []); + + useEffect(() => { + return () => { + if (durationIntervalRef.current) { + clearInterval(durationIntervalRef.current); + } + if (streamRef.current) { + streamRef.current.getTracks().forEach((track) => track.stop()); + } + }; + }, []); + + const transcribeAudio = useCallback( + async (audioBlob: Blob) => { + if (!provider) { + onError('No transcription provider configured'); + return; + } + + setIsTranscribing(true); + + try { + const sizeMB = audioBlob.size / (1024 * 1024); + if (sizeMB > MAX_AUDIO_SIZE_MB) { + onError( + `Audio file too large (${sizeMB.toFixed(1)}MB). Maximum size is ${MAX_AUDIO_SIZE_MB}MB.` + ); + return; + } + + const reader = new FileReader(); + const base64Audio = await new Promise((resolve, reject) => { + reader.onloadend = () => { + const base64 = reader.result as string; + resolve(base64.split(',')[1]); + }; + reader.onerror = reject; + reader.readAsDataURL(audioBlob); + }); + + const mimeType = audioBlob.type; + if (!mimeType) { + throw new Error('Unable to determine audio format'); + } + + const result = await transcribeDictation({ + body: { + audio: base64Audio, + mime_type: mimeType, + provider: provider, + }, + throwOnError: true, + }); + + if (result.data?.text) { + onTranscription(result.data.text); + } + } catch (error) { + onError(errorMessage(error)); + } finally { + setIsTranscribing(false); + setRecordingDuration(0); + setEstimatedSize(0); + } + }, + [provider, onTranscription, onError] + ); + + const startRecording = useCallback(async () => { + if (!isEnabled) { + onError('Voice dictation is not enabled'); + return; + } + + try { + const stream = await navigator.mediaDevices.getUserMedia({ + audio: { + echoCancellation: true, + noiseSuppression: true, + autoGainControl: true, + }, + }); + streamRef.current = stream; + + const supportedTypes = ['audio/webm;codecs=opus', 'audio/webm', 'audio/mp4', 'audio/wav']; + const mimeType = supportedTypes.find((type) => MediaRecorder.isTypeSupported(type)) || ''; + + const mediaRecorder = new MediaRecorder(stream, mimeType ? { mimeType } : {}); + mediaRecorderRef.current = mediaRecorder; + audioChunksRef.current = []; + + const startTime = Date.now(); + durationIntervalRef.current = setInterval(() => { + const elapsed = (Date.now() - startTime) / 1000; + setRecordingDuration(elapsed); + + const estimatedSizeMB = (elapsed * 128 * 1024) / (8 * 1024 * 1024); + setEstimatedSize(estimatedSizeMB); + + if (elapsed >= MAX_RECORDING_DURATION_SECONDS) { + stopRecording(); + onError( + `Maximum recording duration (${MAX_RECORDING_DURATION_SECONDS / 60} minutes) reached` + ); + } + }, 100); + + mediaRecorder.ondataavailable = (event) => { + if (event.data.size > 0) { + audioChunksRef.current.push(event.data); + } + }; + + mediaRecorder.onstop = async () => { + const audioBlob = new Blob(audioChunksRef.current, { type: mimeType || 'audio/webm' }); + + if (audioBlob.size === 0) { + onError('No audio data was recorded. 
Please check your microphone.'); + return; + } + + await transcribeAudio(audioBlob); + }; + + mediaRecorder.onerror = (_event) => { + onError('Recording failed'); + }; + + mediaRecorder.start(100); + setIsRecording(true); + } catch (error) { + stopRecording(); + onError(errorMessage(error)); + } + }, [isEnabled, onError, transcribeAudio, stopRecording]); + + return { + isEnabled, + dictationProvider: provider, + isRecording, + isTranscribing, + recordingDuration, + estimatedSize, + startRecording, + stopRecording, + }; +}; diff --git a/ui/desktop/src/hooks/useDictationSettings.ts b/ui/desktop/src/hooks/useDictationSettings.ts deleted file mode 100644 index 4afe4e0f2574..000000000000 --- a/ui/desktop/src/hooks/useDictationSettings.ts +++ /dev/null @@ -1,76 +0,0 @@ -import { useState, useEffect } from 'react'; -import { useConfig } from '../components/ConfigContext'; -import { - DICTATION_SETTINGS_KEY, - ELEVENLABS_API_KEY, - DICTATION_PROVIDER_OPENAI, - DICTATION_PROVIDER_ELEVENLABS, - getDefaultDictationSettings, - isSecretKeyConfigured, -} from './dictationConstants'; - -export type DictationProvider = - | typeof DICTATION_PROVIDER_OPENAI - | typeof DICTATION_PROVIDER_ELEVENLABS - | null; - -export interface DictationSettings { - enabled: boolean; - provider: DictationProvider; -} - -let elevenLabsKeyCache: boolean | null = null; - -export const setElevenLabsKeyCache = (value: boolean) => { - elevenLabsKeyCache = value; -}; - -export const useDictationSettings = () => { - const [settings, setSettings] = useState(null); - const [hasElevenLabsKey, setHasElevenLabsKey] = useState(elevenLabsKeyCache ?? false); - const { read, getProviders } = useConfig(); - - useEffect(() => { - const loadSettings = async () => { - const saved = localStorage.getItem(DICTATION_SETTINGS_KEY); - - let currentSettings: DictationSettings; - if (saved) { - currentSettings = JSON.parse(saved); - } else { - currentSettings = await getDefaultDictationSettings(getProviders); - } - setSettings(currentSettings); - if ( - currentSettings.provider === DICTATION_PROVIDER_ELEVENLABS && - elevenLabsKeyCache === null - ) { - try { - const response = await read(ELEVENLABS_API_KEY, true); - const hasKey = isSecretKeyConfigured(response); - elevenLabsKeyCache = hasKey; - setHasElevenLabsKey(hasKey); - } catch (error) { - elevenLabsKeyCache = false; - setHasElevenLabsKey(false); - console.error('[useDictationSettings] Error checking ElevenLabs API key:', error); - } - } - }; - - loadSettings(); - - // Listen for storage changes from other tabs/windows - // eslint-disable-next-line @typescript-eslint/no-explicit-any - const handleStorageChange = (e: any) => { - if (e.key === DICTATION_SETTINGS_KEY && e.newValue) { - setSettings(JSON.parse(e.newValue)); - } - }; - - window.addEventListener('storage', handleStorageChange); - return () => window.removeEventListener('storage', handleStorageChange); - }, [read, getProviders]); - - return { settings, hasElevenLabsKey }; -}; diff --git a/ui/desktop/src/hooks/useWhisper.ts b/ui/desktop/src/hooks/useWhisper.ts deleted file mode 100644 index 09c712c38aa5..000000000000 --- a/ui/desktop/src/hooks/useWhisper.ts +++ /dev/null @@ -1,378 +0,0 @@ -import { useState, useRef, useCallback, useEffect } from 'react'; -import { useConfig } from '../components/ConfigContext'; -import { getApiUrl } from '../config'; -import { useDictationSettings } from './useDictationSettings'; -import { DICTATION_PROVIDER_OPENAI, DICTATION_PROVIDER_ELEVENLABS } from './dictationConstants'; -import { safeJsonParse, 
errorMessage } from '../utils/conversionUtils'; - -interface UseWhisperOptions { - onTranscription?: (text: string) => void; - onError?: (error: Error) => void; - onSizeWarning?: (sizeInMB: number) => void; -} - -// Constants -const MAX_AUDIO_SIZE_MB = 25; -const MAX_RECORDING_DURATION_SECONDS = 600; // 10 minutes -const WARNING_SIZE_MB = 20; // Warn at 20MB - -export const useWhisper = ({ onTranscription, onError, onSizeWarning }: UseWhisperOptions = {}) => { - const [isRecording, setIsRecording] = useState(false); - const [isTranscribing, setIsTranscribing] = useState(false); - const [hasOpenAIKey, setHasOpenAIKey] = useState(false); - const [canUseDictation, setCanUseDictation] = useState(false); - const [audioContext, setAudioContext] = useState(null); - const [analyser, setAnalyser] = useState(null); - const [recordingDuration, setRecordingDuration] = useState(0); - const [estimatedSize, setEstimatedSize] = useState(0); - - const mediaRecorderRef = useRef(null); - const audioChunksRef = useRef([]); - const streamRef = useRef(null); - const recordingStartTimeRef = useRef(null); - const durationIntervalRef = useRef | null>(null); - const currentSizeRef = useRef(0); - - const { getProviders } = useConfig(); - const { settings: dictationSettings, hasElevenLabsKey } = useDictationSettings(); - - // Check if OpenAI API key is configured (regardless of current provider) - useEffect(() => { - const checkOpenAIKey = async () => { - try { - // Get all configured providers - const providers = await getProviders(false); - - // Find OpenAI provider - const openAIProvider = providers.find((p) => p.name === 'openai'); - - // Check if OpenAI is configured - if (openAIProvider && openAIProvider.is_configured) { - setHasOpenAIKey(true); - } else { - setHasOpenAIKey(false); - } - } catch (error) { - console.error('Error checking OpenAI configuration:', error); - setHasOpenAIKey(false); - } - }; - - checkOpenAIKey(); - }, [getProviders]); // Re-check when providers change - - // Check if dictation can be used based on settings - useEffect(() => { - if (!dictationSettings) { - setCanUseDictation(false); - return; - } - - if (!dictationSettings.enabled) { - setCanUseDictation(false); - return; - } - - // Check provider availability - switch (dictationSettings.provider) { - case DICTATION_PROVIDER_OPENAI: - setCanUseDictation(hasOpenAIKey); - break; - case DICTATION_PROVIDER_ELEVENLABS: - setCanUseDictation(hasElevenLabsKey); - break; - default: - setCanUseDictation(false); - } - }, [dictationSettings, hasOpenAIKey, hasElevenLabsKey]); - - // Define stopRecording before startRecording to avoid circular dependency - const stopRecording = useCallback(() => { - setIsRecording(false); - - if (mediaRecorderRef.current && mediaRecorderRef.current.state !== 'inactive') { - mediaRecorderRef.current.stop(); - } - - // Clear interval - if (durationIntervalRef.current) { - clearInterval(durationIntervalRef.current); - durationIntervalRef.current = null; - } - - // Stop all tracks in the stream - if (streamRef.current) { - streamRef.current.getTracks().forEach((track) => track.stop()); - streamRef.current = null; - } - - // Close audio context - if (audioContext && audioContext.state !== 'closed') { - audioContext.close().catch(console.error); - setAudioContext(null); - setAnalyser(null); - } - }, [audioContext]); - - // Cleanup effect to prevent memory leaks - useEffect(() => { - return () => { - // Cleanup on unmount - if (durationIntervalRef.current) { - clearInterval(durationIntervalRef.current); - } - if 
(streamRef.current) { - streamRef.current.getTracks().forEach((track) => track.stop()); - } - if (audioContext && audioContext.state !== 'closed') { - audioContext.close().catch(console.error); - } - }; - }, [audioContext]); - - const transcribeAudio = useCallback( - async (audioBlob: Blob) => { - if (!dictationSettings) { - stopRecording(); - onError?.(new Error('Dictation settings not loaded')); - return; - } - - setIsTranscribing(true); - - try { - // Check final size - const sizeMB = audioBlob.size / (1024 * 1024); - if (sizeMB > MAX_AUDIO_SIZE_MB) { - throw new Error( - `Audio file too large (${sizeMB.toFixed(1)}MB). Maximum size is ${MAX_AUDIO_SIZE_MB}MB.` - ); - } - - // Convert blob to base64 for easier transport - const reader = new FileReader(); - const base64Audio = await new Promise((resolve, reject) => { - reader.onloadend = () => { - const base64 = reader.result as string; - resolve(base64.split(',')[1]); // Remove data:audio/webm;base64, prefix - }; - reader.onerror = reject; - reader.readAsDataURL(audioBlob); - }); - - const mimeType = audioBlob.type; - if (!mimeType) { - throw new Error('Unable to determine audio format. Please try again.'); - } - - let endpoint = ''; - let headers: Record = { - 'Content-Type': 'application/json', - 'X-Secret-Key': await window.electron.getSecretKey(), - }; - - let body: Record = { - audio: base64Audio, - mime_type: mimeType, - }; - - // Choose endpoint based on provider - switch (dictationSettings.provider) { - case DICTATION_PROVIDER_OPENAI: - endpoint = '/audio/transcribe'; - break; - case DICTATION_PROVIDER_ELEVENLABS: - endpoint = '/audio/transcribe/elevenlabs'; - break; - default: - throw new Error(`Unsupported provider: ${dictationSettings.provider}`); - } - - const response = await fetch(getApiUrl(endpoint), { - method: 'POST', - headers, - body: JSON.stringify(body), - }); - - if (!response.ok) { - if (response.status === 404) { - throw new Error( - `Audio transcription endpoint not found. Please implement ${endpoint} endpoint in the Goose backend.` - ); - } else if (response.status === 401) { - throw new Error('Invalid API key. Please check your API key is correct.'); - } else if (response.status === 402) { - throw new Error('API quota exceeded. 
Please check your account limits.'); - } - const errorData = await safeJsonParse<{ - error: { message: string }; - }>(response, 'Failed to parse error response').catch(() => ({ - error: { message: 'Transcription failed' }, - })); - throw new Error(errorData.error?.message || 'Transcription failed'); - } - - const data = await safeJsonParse<{ text: string }>( - response, - 'Failed to parse transcription response' - ); - if (data.text) { - onTranscription?.(data.text); - } - } catch (error) { - console.error('Error transcribing audio:', error); - stopRecording(); - onError?.(error as Error); - } finally { - setIsTranscribing(false); - setRecordingDuration(0); - setEstimatedSize(0); - } - }, - [onTranscription, onError, dictationSettings, stopRecording] - ); - - const startRecording = useCallback(async () => { - if (!dictationSettings) { - stopRecording(); - onError?.(new Error('Dictation settings not loaded')); - return; - } - - try { - // Request microphone permission - const stream = await navigator.mediaDevices.getUserMedia({ - audio: { - echoCancellation: true, - noiseSuppression: true, - autoGainControl: true, - sampleRate: 44100, - }, - }); - streamRef.current = stream; - - // Verify we have valid audio tracks - const audioTracks = stream.getAudioTracks(); - if (audioTracks.length === 0) { - throw new Error('No audio tracks available in the microphone stream'); - } - - // AudioContext creation is disabled to prevent MediaRecorder conflicts - setAudioContext(null); - setAnalyser(null); - - // Determine best supported MIME type - const supportedTypes = ['audio/webm;codecs=opus', 'audio/webm', 'audio/mp4', 'audio/wav']; - - const mimeType = supportedTypes.find((type) => MediaRecorder.isTypeSupported(type)) || ''; - - const mediaRecorder = new MediaRecorder(stream, mimeType ? { mimeType } : {}); - - mediaRecorderRef.current = mediaRecorder; - audioChunksRef.current = []; - currentSizeRef.current = 0; - recordingStartTimeRef.current = Date.now(); - - // Start duration and size tracking - durationIntervalRef.current = setInterval(() => { - const elapsed = (Date.now() - (recordingStartTimeRef.current || 0)) / 1000; - setRecordingDuration(elapsed); - - // Estimate size based on typical webm bitrate (~128kbps) - const estimatedSizeMB = (elapsed * 128 * 1024) / (8 * 1024 * 1024); - setEstimatedSize(estimatedSizeMB); - - // Check if we're approaching the limit - if (estimatedSizeMB > WARNING_SIZE_MB) { - onSizeWarning?.(estimatedSizeMB); - } - - // Auto-stop if we hit the duration limit - if (elapsed >= MAX_RECORDING_DURATION_SECONDS) { - stopRecording(); - onError?.( - new Error( - `Maximum recording duration (${MAX_RECORDING_DURATION_SECONDS / 60} minutes) reached.` - ) - ); - } - }, 100); - - mediaRecorder.ondataavailable = (event) => { - if (event.data.size > 0) { - audioChunksRef.current.push(event.data); - currentSizeRef.current += event.data.size; - - // Check actual size - const actualSizeMB = currentSizeRef.current / (1024 * 1024); - if (actualSizeMB > MAX_AUDIO_SIZE_MB) { - stopRecording(); - onError?.(new Error(`Maximum file size (${MAX_AUDIO_SIZE_MB}MB) reached.`)); - } - } - }; - - mediaRecorder.onstop = async () => { - const audioBlob = new Blob(audioChunksRef.current, { type: mimeType || 'audio/webm' }); - - // Check if the blob is empty - if (audioBlob.size === 0) { - onError?.( - new Error( - 'No audio data was recorded. Please check your microphone permissions and try again.' 
- ) - ); - return; - } - - await transcribeAudio(audioBlob); - }; - - // Add error handler for MediaRecorder - mediaRecorder.onerror = (event) => { - console.error('MediaRecorder error:', event); - onError?.(new Error('Recording failed: Unknown error')); - }; - - if (!stream.active) { - throw new Error('Audio stream became inactive before recording could start'); - } - - // Check audio tracks again before starting recording - if (audioTracks.length === 0) { - throw new Error('No audio tracks available in the stream'); - } - - const activeAudioTracks = audioTracks.filter((track) => track.readyState === 'live'); - if (activeAudioTracks.length === 0) { - throw new Error('No live audio tracks available'); - } - - try { - mediaRecorder.start(100); - setIsRecording(true); - } catch (startError) { - console.error('Error calling mediaRecorder.start():', startError); - throw new Error(`Failed to start recording: ${errorMessage(startError)}`); - } - } catch (error) { - console.error('Error starting recording:', error); - stopRecording(); - onError?.(error as Error); - } - }, [onError, onSizeWarning, transcribeAudio, stopRecording, dictationSettings]); - - return { - isRecording, - isTranscribing, - hasOpenAIKey, - canUseDictation, - audioContext, - analyser, - startRecording, - stopRecording, - recordingDuration, - estimatedSize, - dictationSettings, - }; -}; diff --git a/ui/desktop/src/updates.ts b/ui/desktop/src/updates.ts index 756e73682c3e..14e49fc10a07 100644 --- a/ui/desktop/src/updates.ts +++ b/ui/desktop/src/updates.ts @@ -2,5 +2,4 @@ export const UPDATES_ENABLED = true; export const COST_TRACKING_ENABLED = true; export const ANNOUNCEMENTS_ENABLED = false; export const CONFIGURATION_ENABLED = true; -export const VOICE_DICTATION_ELEVENLABS_ENABLED = true; export const TELEMETRY_UI_ENABLED = true;
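Reviewer note: with the `VOICE_DICTATION_ELEVENLABS_ENABLED` flag removed, the only switch left is the `voice_dictation_provider` config key that both `DictationSettings` and `useAudioRecorder` read. For anyone wiring the new hook outside ChatInput, a minimal consumer looks like the sketch below; the component, its labels, and styling are illustrative, not part of this diff:

```tsx
import { useAudioRecorder } from './hooks/useAudioRecorder';

// Hypothetical button that wires the hook the same way ChatInput does above.
export function DictationButton({ onText }: { onText: (text: string) => void }) {
  const { isEnabled, isRecording, isTranscribing, startRecording, stopRecording } =
    useAudioRecorder({
      onTranscription: onText, // called with the transcript on success
      onError: (message) => console.error('Dictation error:', message),
    });

  // Hidden entirely when no provider is selected/configured in backend config.
  if (!isEnabled) return null;

  return (
    <button disabled={isTranscribing} onClick={isRecording ? stopRecording : startRecording}>
      {isRecording ? 'Stop' : isTranscribing ? 'Transcribing…' : 'Dictate'}
    </button>
  );
}
```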