From c4697426dc2e1228f5300491c22683ddfd7390b3 Mon Sep 17 00:00:00 2001 From: yunyaozhou Date: Wed, 15 Apr 2026 00:21:00 +0800 Subject: [PATCH] feat(voice): add realtime dictation modes --- hub/src/web/routes/voice.ts | 150 +++++++++++++++++ web/src/api/client.ts | 28 +++- .../AssistantChat/HappyComposer.tsx | 48 +++++- web/src/components/SessionChat.tsx | 1 + web/src/hooks/useElevenLabsTranscription.ts | 158 ++++++++++++++++++ web/src/hooks/useSpeechToText.ts | 156 +++++++++++++++++ web/src/hooks/useVoiceMode.ts | 42 +++++ web/src/lib/locales/en.ts | 4 + web/src/lib/locales/zh-CN.ts | 4 + web/src/routes/settings/index.tsx | 38 ++++- web/src/types/api.ts | 11 ++ 11 files changed, 631 insertions(+), 9 deletions(-) create mode 100644 web/src/hooks/useElevenLabsTranscription.ts create mode 100644 web/src/hooks/useSpeechToText.ts create mode 100644 web/src/hooks/useVoiceMode.ts diff --git a/hub/src/web/routes/voice.ts b/hub/src/web/routes/voice.ts index 1a55f8363..e5cfa8b94 100644 --- a/hub/src/web/routes/voice.ts +++ b/hub/src/web/routes/voice.ts @@ -12,6 +12,19 @@ const tokenRequestSchema = z.object({ customApiKey: z.string().optional() }) +const scribeTokenRequestSchema = z.object({ + customApiKey: z.string().optional() +}) + +const transcriptionModelSchema = z.enum(['scribe_v1', 'scribe_v2']) + +const SUPPORTED_ELEVENLABS_LANGUAGE_CODES = new Set([ + 'en', 'ja', 'zh', 'de', 'hi', 'fr', 'ko', + 'pt', 'pt-br', 'it', 'es', 'id', 'nl', 'tr', 'pl', 'sv', 'bg', + 'ro', 'ar', 'cs', 'el', 'fi', 'ms', 'da', 'ta', 'uk', 'ru', + 'hu', 'hr', 'sk', 'no', 'vi', 'tl' +]) + // Cache for auto-created agent IDs (keyed by API key hash) const agentIdCache = new Map() @@ -20,6 +33,35 @@ interface ElevenLabsAgent { name: string } +interface ElevenLabsTool { + id: string + tool_config?: { + name?: string + type?: string + } +} + +function normalizeTranscriptionLanguageCode(raw: string | null): string | undefined { + if (!raw) return undefined + + const normalized = raw.trim().toLowerCase() + if (!normalized) return undefined + + if (SUPPORTED_ELEVENLABS_LANGUAGE_CODES.has(normalized)) { + return normalized + } + + if (normalized === 'pt-br' || normalized.startsWith('pt-br-')) { + return 'pt-br' + } + + const base = normalized.split(/[-_]/)[0] + if (base && SUPPORTED_ELEVENLABS_LANGUAGE_CODES.has(base)) { + return base + } + + return undefined +} /** * Find an existing "Hapi Voice Assistant" agent */ @@ -193,5 +235,113 @@ export function createVoiceRoutes(): Hono { } }) + app.post('/voice/transcribe', async (c) => { + const formData = await c.req.formData().catch(() => null) + if (!formData) { + return c.json({ error: 'Invalid form data' }, 400) + } + + const file = formData.get('file') + const modelIdRaw = formData.get('modelId') + const languageCodeRaw = formData.get('languageCode') + + if (!(file instanceof File)) { + return c.json({ error: 'Missing audio file' }, 400) + } + + const modelIdParsed = transcriptionModelSchema.safeParse( + typeof modelIdRaw === 'string' ? modelIdRaw : 'scribe_v2' + ) + if (!modelIdParsed.success) { + return c.json({ error: 'Invalid modelId' }, 400) + } + + const apiKey = process.env.ELEVENLABS_API_KEY + if (!apiKey) { + return c.json({ error: 'ElevenLabs API key not configured' }, 400) + } + + const upstreamFormData = new FormData() + upstreamFormData.set('model_id', modelIdParsed.data) + upstreamFormData.set('file', file, file.name || 'speech.webm') + const languageCode = typeof languageCodeRaw === 'string' + ? 
normalizeTranscriptionLanguageCode(languageCodeRaw)
+      : undefined
+
+    if (languageCode && modelIdParsed.data === 'scribe_v2') {
+      upstreamFormData.set('language_code', languageCode)
+    }
+
+    try {
+      const response = await fetch(`${ELEVENLABS_API_BASE}/speech-to-text`, {
+        method: 'POST',
+        headers: {
+          'xi-api-key': apiKey,
+          'Accept': 'application/json'
+        },
+        body: upstreamFormData
+      })
+
+      if (!response.ok) {
+        const errorData = await response.json().catch(() => ({})) as { detail?: { message?: string } | string; error?: string }
+        const errorMessage = typeof errorData.detail === 'string'
+          ? errorData.detail
+          : errorData.detail?.message || errorData.error || `ElevenLabs API error: ${response.status}`
+        return c.json({ error: errorMessage }, 500)
+      }
+
+      const data = await response.json() as { text?: string; language_code?: string }
+      return c.json({
+        text: data.text ?? '',
+        languageCode: data.language_code
+      })
+    } catch (error) {
+      return c.json({
+        error: error instanceof Error ? error.message : 'Network error'
+      }, 500)
+    }
+  })
+
+  app.post('/voice/scribe-token', async (c) => {
+    const json = await c.req.json().catch(() => null)
+    const parsed = scribeTokenRequestSchema.safeParse(json ?? {})
+    if (!parsed.success) {
+      return c.json({ error: 'Invalid request body' }, 400)
+    }
+
+    const apiKey = parsed.data.customApiKey || process.env.ELEVENLABS_API_KEY
+    if (!apiKey) {
+      return c.json({ error: 'ElevenLabs API key not configured' }, 400)
+    }
+
+    try {
+      const response = await fetch(`${ELEVENLABS_API_BASE}/single-use-token/realtime_scribe`, {
+        method: 'POST',
+        headers: {
+          'xi-api-key': apiKey,
+          'Accept': 'application/json'
+        }
+      })
+
+      if (!response.ok) {
+        const errorData = await response.json().catch(() => ({})) as { detail?: { message?: string } | string; error?: string }
+        const errorMessage = typeof errorData.detail === 'string'
+          ? errorData.detail
+          : errorData.detail?.message || errorData.error || `ElevenLabs API error: ${response.status}`
+        return c.json({ error: errorMessage }, 500)
+      }
+
+      const data = await response.json() as { token?: string }
+      if (!data.token) {
+        return c.json({ error: 'No token in ElevenLabs response' }, 500)
+      }
+
+      return c.json({ token: data.token })
+    } catch (error) {
+      return c.json({
+        error: error instanceof Error ? error.message : 'Network error'
+      }, 500)
+    }
+  })
+
   return app
 }
diff --git a/web/src/api/client.ts b/web/src/api/client.ts
index 7f1083c8a..5ddd84693 100644
--- a/web/src/api/client.ts
+++ b/web/src/api/client.ts
@@ -19,6 +19,8 @@ import type {
   SpawnResponse,
   UploadFileResponse,
   VisibilityPayload,
+  VoiceScribeTokenResponse,
+  VoiceTranscriptionResponse,
   SessionResponse,
   SessionsResponse
 } from '@/types/api'
@@ -94,7 +96,7 @@ export class ApiClient {
     if (authToken) {
       headers.set('authorization', `Bearer ${authToken}`)
     }
-    if (init?.body !== undefined && !headers.has('content-type')) {
+    if (init?.body !== undefined && !(init.body instanceof FormData) && !headers.has('content-type')) {
       headers.set('content-type', 'application/json')
     }
@@ -443,4 +445,28 @@ export class ApiClient {
       body: JSON.stringify(options || {})
     })
   }
+
+  async transcribeVoice(
+    file: File,
+    options?: { modelId?: 'scribe_v1' | 'scribe_v2'; languageCode?: string }
+  ): Promise<VoiceTranscriptionResponse> {
+    const formData = new FormData()
+    formData.set('file', file)
+    formData.set('modelId', options?.modelId ?? 'scribe_v2')
+    if (options?.languageCode) {
+      formData.set('languageCode', options.languageCode)
+    }
+
+    return await this.request<VoiceTranscriptionResponse>('/api/voice/transcribe', {
+      method: 'POST',
+      body: formData
+    })
+  }
+
+  async fetchVoiceScribeToken(): Promise<VoiceScribeTokenResponse> {
+    return await this.request<VoiceScribeTokenResponse>('/api/voice/scribe-token', {
+      method: 'POST',
+      body: JSON.stringify({})
+    })
+  }
 }
diff --git a/web/src/components/AssistantChat/HappyComposer.tsx b/web/src/components/AssistantChat/HappyComposer.tsx
index cab3e014f..7f479b112 100644
--- a/web/src/components/AssistantChat/HappyComposer.tsx
+++ b/web/src/components/AssistantChat/HappyComposer.tsx
@@ -12,6 +12,7 @@ import {
   useRef,
   useState
 } from 'react'
+import type { ApiClient } from '@/api/client'
 import type { AgentState, CodexCollaborationMode, PermissionMode } from '@/types/api'
 import type { Suggestion } from '@/hooks/useActiveSuggestions'
 import type { ConversationStatus } from '@/realtime/types'
@@ -22,6 +23,10 @@ import { usePlatform } from '@/hooks/usePlatform'
 import { usePWAInstall } from '@/hooks/usePWAInstall'
 import { supportsEffort, supportsModelChange } from '@hapi/protocol'
 import { markSkillUsed } from '@/lib/recent-skills'
+import { useComposerDraft } from '@/hooks/useComposerDraft'
+import { useElevenLabsTranscription } from '@/hooks/useElevenLabsTranscription'
+import { useSpeechToText } from '@/hooks/useSpeechToText'
+import { useVoiceMode } from '@/hooks/useVoiceMode'
 import { FloatingOverlay } from '@/components/ChatInput/FloatingOverlay'
 import { Autocomplete } from '@/components/ChatInput/Autocomplete'
 import { StatusBar } from '@/components/AssistantChat/StatusBar'
@@ -64,6 +69,7 @@ export function HappyComposer(props: {
   terminalUnsupported?: boolean
   autocompletePrefixes?: string[]
   autocompleteSuggestions?: (query: string) => Promise<Suggestion[]>
+  voiceTranscriptionApi?: ApiClient
   // Voice assistant props
   voiceStatus?: ConversationStatus
   voiceMicMuted?: boolean
@@ -96,6 +102,7 @@ export function HappyComposer(props: {
   terminalUnsupported = false,
   autocompletePrefixes = ['@', '/', '$'],
   autocompleteSuggestions = defaultSuggestionHandler,
+  voiceTranscriptionApi,
   voiceStatus = 'disconnected',
   voiceMicMuted = false,
   onVoiceToggle,
@@ -165,6 +172,7 @@ export function HappyComposer(props: {
   }, [controlledByUser])
 
   const { haptic: platformHaptic, isTouch } = usePlatform()
+  const { voiceMode } = useVoiceMode()
   const { isStandalone, isIOS } = usePWAInstall()
   const isIOSPWA = isIOS && isStandalone
   const bottomPaddingClass = isIOSPWA ? 'pb-0' : 'pb-3'
@@ -185,6 +193,34 @@ export function HappyComposer(props: {
     }
   }, [platformHaptic])
 
+  const dictation = useSpeechToText({
+    getCurrentText: () => composerText,
+    onTextChange: (text) => api.composer().setText(text)
+  })
+  const elevenLabsDictation = useElevenLabsTranscription({
+    api: voiceTranscriptionApi ?? null,
+    getCurrentText: () => composerText,
+    onTextChange: (text) => api.composer().setText(text)
+  })
+
+  const effectiveVoiceStatus = voiceMode === 'dictation-local'
+    ? dictation.status
+    : voiceMode === 'dictation-elevenlabs'
+      ? elevenLabsDictation.status
+      : voiceStatus
+  const effectiveVoiceEnabled = voiceMode === 'dictation-local'
+    ? dictation.supported
+    : voiceMode === 'dictation-elevenlabs'
+      ? elevenLabsDictation.supported
+      : Boolean(onVoiceToggle)
+  const effectiveVoiceMicMuted = voiceMode === 'assistant' ? voiceMicMuted : false
+  const effectiveOnVoiceToggle = voiceMode === 'dictation-local'
+    ? dictation.toggle
+    : voiceMode === 'dictation-elevenlabs'
+      ?
elevenLabsDictation.toggle + : onVoiceToggle + const effectiveOnVoiceMicToggle = voiceMode === 'assistant' ? onVoiceMicToggle : undefined + const handleSuggestionSelect = useCallback((index: number) => { const suggestion = suggestions[index] if (!suggestion || !textareaRef.current) return @@ -483,7 +519,7 @@ export function HappyComposer(props: { || showEffortSettings ) const showAbortButton = true - const voiceEnabled = Boolean(onVoiceToggle) + const voiceEnabled = effectiveVoiceEnabled const handleSend = useCallback(() => { api.composer().send() @@ -759,7 +795,7 @@ export function HappyComposer(props: { permissionMode={permissionMode} collaborationMode={collaborationMode} agentFlavor={agentFlavor} - voiceStatus={voiceStatus} + voiceStatus={effectiveVoiceStatus} />
@@ -804,10 +840,10 @@ export function HappyComposer(props: { isSwitching={isSwitching} onSwitch={handleSwitch} voiceEnabled={voiceEnabled} - voiceStatus={voiceStatus} - voiceMicMuted={voiceMicMuted} - onVoiceToggle={onVoiceToggle ?? (() => {})} - onVoiceMicToggle={onVoiceMicToggle} + voiceStatus={effectiveVoiceStatus} + voiceMicMuted={effectiveVoiceMicMuted} + onVoiceToggle={effectiveOnVoiceToggle ?? (() => {})} + onVoiceMicToggle={effectiveOnVoiceMicToggle} onSend={handleSend} />
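[Review note, not part of the patch] A minimal usage sketch of the new batch-transcription client method, in TypeScript. recordClip and dictateOnce are hypothetical helpers; ApiClient.transcribeVoice, the 'hapi-voice-lang' localStorage key, the scribe_v2-only language forwarding, and the { text, languageCode } response shape are taken from this patch.

import type { ApiClient } from '@/api/client'

// Capture a short microphone clip with the standard MediaRecorder API.
async function recordClip(durationMs: number): Promise<File> {
  const stream = await navigator.mediaDevices.getUserMedia({ audio: true })
  const recorder = new MediaRecorder(stream)
  const chunks: BlobPart[] = []
  recorder.ondataavailable = (event) => chunks.push(event.data)
  const stopped = new Promise<void>((resolve) => {
    recorder.onstop = () => resolve()
  })
  recorder.start()
  await new Promise((resolve) => setTimeout(resolve, durationMs))
  recorder.stop()
  await stopped
  stream.getTracks().forEach((track) => track.stop())
  return new File(chunks, 'speech.webm', { type: recorder.mimeType })
}

// One-shot dictation: record, then post to the hub's /api/voice/transcribe.
async function dictateOnce(api: ApiClient): Promise<string> {
  const clip = await recordClip(5000)
  const result = await api.transcribeVoice(clip, {
    modelId: 'scribe_v2',
    // The hub only forwards languageCode upstream for scribe_v2.
    languageCode: localStorage.getItem('hapi-voice-lang') ?? undefined
  })
  return result.text
}

Sending FormData here relies on the ApiClient.request change above: when the body is FormData, the client no longer forces a JSON content-type, so the browser can set the multipart boundary itself.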
diff --git a/web/src/components/SessionChat.tsx b/web/src/components/SessionChat.tsx
index 1180aeb3a..6812327b5 100644
--- a/web/src/components/SessionChat.tsx
+++ b/web/src/components/SessionChat.tsx
@@ -419,6 +419,7 @@ export function SessionChat(props: {
         onTerminal={props.session.active && terminalSupported ? handleViewTerminal : undefined}
         terminalUnsupported={props.session.active && !terminalSupported}
         autocompleteSuggestions={props.autocompleteSuggestions}
+        voiceTranscriptionApi={props.api}
         voiceStatus={voice?.status}
         voiceMicMuted={voice?.micMuted}
         onVoiceToggle={voice ? handleVoiceToggle : undefined}
diff --git a/web/src/hooks/useElevenLabsTranscription.ts b/web/src/hooks/useElevenLabsTranscription.ts
new file mode 100644
index 000000000..c4831a88e
--- /dev/null
+++ b/web/src/hooks/useElevenLabsTranscription.ts
@@ -0,0 +1,158 @@
+import { useCallback, useEffect, useMemo, useRef, useState } from 'react'
+import { useScribe } from '@elevenlabs/react'
+import type { ApiClient } from '@/api/client'
+import type { ConversationStatus } from '@/realtime/types'
+import { getElevenLabsCodeFromPreference } from '@/lib/languages'
+
+function normalizeText(baseText: string, transcript: string): string {
+  const trimmedTranscript = transcript.trim()
+  if (!trimmedTranscript) {
+    return baseText
+  }
+
+  const prefix = baseText.trimEnd()
+  if (!prefix) {
+    return trimmedTranscript
+  }
+
+  return `${prefix} ${trimmedTranscript}`
+}
+
+export function useElevenLabsTranscription(config: {
+  api: ApiClient | null
+  getCurrentText: () => string
+  onTextChange: (text: string) => void
+}): {
+  supported: boolean
+  status: ConversationStatus
+  start: () => Promise<void>
+  stop: () => Promise<void>
+  toggle: () => Promise<void>
+} {
+  const supported = typeof window !== 'undefined'
+    && typeof navigator !== 'undefined'
+    && typeof navigator.mediaDevices?.getUserMedia === 'function'
+    && config.api !== null
+
+  const baseTextRef = useRef('')
+  const committedTextRef = useRef('')
+  const pendingStartRef = useRef(false)
+  const [localStatus, setLocalStatus] = useState<ConversationStatus>('disconnected')
+
+  const updateText = useCallback((partialTranscript: string) => {
+    config.onTextChange(
+      normalizeText(
+        baseTextRef.current,
+        `${committedTextRef.current} ${partialTranscript}`.trim()
+      )
+    )
+  }, [config])
+
+  const scribe = useScribe({
+    modelId: 'scribe_v2_realtime',
+    onConnect: () => {
+      pendingStartRef.current = false
+      setLocalStatus('connected')
+    },
+    onPartialTranscript: ({ text }) => {
+      updateText(text)
+    },
+    onCommittedTranscript: ({ text }) => {
+      const trimmedText = text.trim()
+      if (!trimmedText) return
+      committedTextRef.current = `${committedTextRef.current} ${trimmedText}`.trim()
+      updateText('')
+    },
+    onDisconnect: () => {
+      pendingStartRef.current = false
+      setLocalStatus('disconnected')
+    },
+    onError: (error) => {
+      pendingStartRef.current = false
+      console.error('[Voice] ElevenLabs realtime transcription error:', error)
+      setLocalStatus('error')
+    }
+  })
+
+  const status = useMemo(() => {
+    if (localStatus === 'error') return 'error'
+    if (pendingStartRef.current) return 'connecting'
+    if (scribe.status === 'connecting') return 'connecting'
+    if (scribe.status === 'connected' || scribe.status === 'transcribing') return 'connected'
+    if (scribe.status === 'error') return 'error'
+    return localStatus
+  }, [localStatus, scribe.status])
+
+  const stop = useCallback(async () => {
+    pendingStartRef.current = false
+    setLocalStatus('disconnected')
+    if (!scribe.isConnected && scribe.status !== 'connecting') {
+      return
+    }
+    scribe.disconnect()
+  }, [scribe])
+
+  const start = useCallback(async () => {
+    if (!supported) {
+      return
+    }
+
+    if (scribe.isConnected || scribe.status === 'connecting' || pendingStartRef.current) {
+      return
+    }
+
+    pendingStartRef.current = true
+    setLocalStatus('connecting')
+    baseTextRef.current = config.getCurrentText()
+    committedTextRef.current = ''
+    scribe.clearTranscripts()
+
+    try {
+      const tokenResponse = await config.api!.fetchVoiceScribeToken()
+      if (!tokenResponse.token) {
+        throw new Error(tokenResponse.error || 'Failed to fetch ElevenLabs realtime token')
+      }
+
+      const languageCode = getElevenLabsCodeFromPreference(localStorage.getItem('hapi-voice-lang'))
+
+      await scribe.connect({
+        token: tokenResponse.token,
+        modelId: 'scribe_v2_realtime',
+        languageCode,
+        microphone: {
+          echoCancellation: true,
+          noiseSuppression: true,
+          autoGainControl: true,
+          channelCount: 1
+        }
+      })
+    } catch (error) {
+      pendingStartRef.current = false
+      console.error('[Voice] Failed to start ElevenLabs realtime transcription:', error)
+      setLocalStatus('error')
+    }
+  }, [config, scribe, supported])
+
+  const toggle = useCallback(async () => {
+    if (status === 'connected' || status === 'connecting') {
+      await stop()
+      return
+    }
+    await start()
+  }, [start, status, stop])
+
+  useEffect(() => {
+    return () => {
+      pendingStartRef.current = false
+      scribe.disconnect()
+    }
+  }, [])
+
+  return {
+    supported,
+    status,
+    start,
+    stop,
+    toggle
+  }
+}
diff --git a/web/src/hooks/useSpeechToText.ts b/web/src/hooks/useSpeechToText.ts
new file mode 100644
index 000000000..3c4b7d955
--- /dev/null
+++ b/web/src/hooks/useSpeechToText.ts
@@ -0,0 +1,156 @@
+import { useCallback, useEffect, useRef, useState } from 'react'
+import type { ConversationStatus } from '@/realtime/types'
+
+interface SpeechRecognitionResultLike {
+  isFinal: boolean
+  0: {
+    transcript: string
+  }
+}
+
+interface SpeechRecognitionEventLike extends Event {
+  results: ArrayLike<SpeechRecognitionResultLike>
+  resultIndex: number
+}
+
+interface SpeechRecognitionLike extends EventTarget {
+  continuous: boolean
+  interimResults: boolean
+  lang: string
+  onstart: ((event: Event) => void) | null
+  onend: ((event: Event) => void) | null
+  onerror: ((event: Event & { error?: string }) => void) | null
+  onresult: ((event: SpeechRecognitionEventLike) => void) | null
+  start(): void
+  stop(): void
+}
+
+type SpeechRecognitionConstructor = new () => SpeechRecognitionLike
+
+declare global {
+  interface Window {
+    SpeechRecognition?: SpeechRecognitionConstructor
+    webkitSpeechRecognition?: SpeechRecognitionConstructor
+  }
+}
+
+function normalizeText(baseText: string, transcript: string): string {
+  const trimmedTranscript = transcript.trim()
+  if (!trimmedTranscript) {
+    return baseText
+  }
+
+  const prefix = baseText.trimEnd()
+  if (!prefix) {
+    return trimmedTranscript
+  }
+  return `${prefix} ${trimmedTranscript}`
+}
+
+function getRecognitionConstructor(): SpeechRecognitionConstructor | null {
+  return window.SpeechRecognition ?? window.webkitSpeechRecognition ?? null
+}
+
+export function useSpeechToText(config: {
+  getCurrentText: () => string
+  onTextChange: (text: string) => void
+}): {
+  supported: boolean
+  status: ConversationStatus
+  start: () => void
+  stop: () => void
+  toggle: () => void
+} {
+  const supported = typeof window !== 'undefined' && getRecognitionConstructor() !== null
+  const [status, setStatus] = useState<ConversationStatus>('disconnected')
+  const recognitionRef = useRef<SpeechRecognitionLike | null>(null)
+  const baseTextRef = useRef('')
+  const finalizedTextRef = useRef('')
+
+  const stop = useCallback(() => {
+    recognitionRef.current?.stop()
+  }, [])
+
+  const start = useCallback(() => {
+    const Recognition = getRecognitionConstructor()
+    if (!Recognition) {
+      setStatus('error')
+      return
+    }
+
+    recognitionRef.current?.stop()
+    setStatus('connecting')
+
+    const recognition = new Recognition()
+    recognition.continuous = true
+    recognition.interimResults = true
+    recognition.lang = localStorage.getItem('hapi-voice-lang') || navigator.language || 'en-US'
+
+    baseTextRef.current = config.getCurrentText()
+    finalizedTextRef.current = ''
+
+    recognition.onstart = () => {
+      setStatus('connected')
+    }
+
+    recognition.onresult = (event) => {
+      let finalizedChunk = ''
+      let interimChunk = ''
+
+      for (let index = event.resultIndex; index < event.results.length; index += 1) {
+        const transcript = event.results[index]?.[0]?.transcript ?? ''
+        if (!transcript.trim()) continue
+        if (event.results[index].isFinal) {
+          finalizedChunk += `${transcript} `
+        } else {
+          interimChunk += `${transcript} `
+        }
+      }
+
+      if (finalizedChunk.trim()) {
+        finalizedTextRef.current = `${finalizedTextRef.current} ${finalizedChunk}`.trim()
+      }
+
+      const nextText = normalizeText(
+        baseTextRef.current,
+        `${finalizedTextRef.current} ${interimChunk}`.trim()
+      )
+      config.onTextChange(nextText)
+    }
+
+    recognition.onerror = () => {
+      recognitionRef.current = null
+      setStatus('error')
+    }
+
+    recognition.onend = () => {
+      recognitionRef.current = null
+      setStatus((current) => (current === 'error' ?
current : 'disconnected')) + } + + recognitionRef.current = recognition + recognition.start() + }, [config]) + + const toggle = useCallback(() => { + if (status === 'connected' || status === 'connecting') { + stop() + return + } + start() + }, [start, status, stop]) + + useEffect(() => { + return () => { + recognitionRef.current?.stop() + } + }, []) + + return { + supported, + status, + start, + stop, + toggle + } +} diff --git a/web/src/hooks/useVoiceMode.ts b/web/src/hooks/useVoiceMode.ts new file mode 100644 index 000000000..bdba7d7cb --- /dev/null +++ b/web/src/hooks/useVoiceMode.ts @@ -0,0 +1,42 @@ +import { useCallback, useEffect, useState } from 'react' + +export type VoiceMode = 'assistant' | 'dictation-local' | 'dictation-elevenlabs' + +const VOICE_MODE_STORAGE_KEY = 'hapi-voice-mode' +const VOICE_MODE_EVENT = 'hapi-voice-mode-change' + +function readVoiceMode(): VoiceMode { + const stored = localStorage.getItem(VOICE_MODE_STORAGE_KEY) + if (stored === 'dictation' || stored === 'dictation-local') return 'dictation-local' + if (stored === 'dictation-elevenlabs') return 'dictation-elevenlabs' + return 'assistant' +} + +export function useVoiceMode(): { + voiceMode: VoiceMode + setVoiceMode: (mode: VoiceMode) => void +} { + const [voiceMode, setVoiceModeState] = useState(readVoiceMode) + + useEffect(() => { + const sync = () => setVoiceModeState(readVoiceMode()) + + window.addEventListener('storage', sync) + window.addEventListener(VOICE_MODE_EVENT, sync) + return () => { + window.removeEventListener('storage', sync) + window.removeEventListener(VOICE_MODE_EVENT, sync) + } + }, []) + + const setVoiceMode = useCallback((mode: VoiceMode) => { + localStorage.setItem(VOICE_MODE_STORAGE_KEY, mode) + setVoiceModeState(mode) + window.dispatchEvent(new Event(VOICE_MODE_EVENT)) + }, []) + + return { + voiceMode, + setVoiceMode + } +} diff --git a/web/src/lib/locales/en.ts b/web/src/lib/locales/en.ts index 77b5e1fe0..397c93195 100644 --- a/web/src/lib/locales/en.ts +++ b/web/src/lib/locales/en.ts @@ -265,6 +265,10 @@ export default { 'settings.display.fontSize': 'Font Size', 'settings.display.terminalFontSize': 'Terminal Font Size', 'settings.voice.title': 'Voice Assistant', + 'settings.voice.mode': 'Voice Mode', + 'settings.voice.mode.assistant': 'Assistant', + 'settings.voice.mode.dictationLocal': 'Local Dictation', + 'settings.voice.mode.dictationElevenLabs': 'ElevenLabs Transcription', 'settings.voice.language': 'Voice Language', 'settings.voice.autoDetect': 'Auto-detect', 'settings.about.title': 'About', diff --git a/web/src/lib/locales/zh-CN.ts b/web/src/lib/locales/zh-CN.ts index ca698dce3..45fef366d 100644 --- a/web/src/lib/locales/zh-CN.ts +++ b/web/src/lib/locales/zh-CN.ts @@ -267,6 +267,10 @@ export default { 'settings.display.fontSize': '字体大小', 'settings.display.terminalFontSize': '终端字体大小', 'settings.voice.title': '语音助手', + 'settings.voice.mode': '语音模式', + 'settings.voice.mode.assistant': '语音助手', + 'settings.voice.mode.dictationLocal': '本地转写', + 'settings.voice.mode.dictationElevenLabs': 'ElevenLabs 转写', 'settings.voice.language': '语音语言', 'settings.voice.autoDetect': '自动检测', 'settings.about.title': '关于', diff --git a/web/src/routes/settings/index.tsx b/web/src/routes/settings/index.tsx index 971f2171f..2a004f9ff 100644 --- a/web/src/routes/settings/index.tsx +++ b/web/src/routes/settings/index.tsx @@ -5,6 +5,7 @@ import { getElevenLabsSupportedLanguages, getLanguageDisplayName, type Language import { getFontScaleOptions, useFontScale, type FontScale } from '@/hooks/useFontScale' 
import { getTerminalFontSizeOptions, useTerminalFontSize, type TerminalFontSize } from '@/hooks/useTerminalFontSize' import { useAppearance, getAppearanceOptions, type AppearancePreference } from '@/hooks/useTheme' +import { useVoiceMode, type VoiceMode } from '@/hooks/useVoiceMode' import { PROTOCOL_VERSION } from '@hapi/protocol' const locales: { value: Locale; nativeLabel: string }[] = [ @@ -13,6 +14,11 @@ const locales: { value: Locale; nativeLabel: string }[] = [ ] const voiceLanguages = getElevenLabsSupportedLanguages() +const voiceModeOptions: Array<{ value: VoiceMode; labelKey: string }> = [ + { value: 'assistant', labelKey: 'settings.voice.mode.assistant' }, + { value: 'dictation-local', labelKey: 'settings.voice.mode.dictationLocal' }, + { value: 'dictation-elevenlabs', labelKey: 'settings.voice.mode.dictationElevenLabs' } +] function BackIcon(props: { className?: string }) { return ( @@ -87,6 +93,7 @@ export default function SettingsPage() { const { fontScale, setFontScale } = useFontScale() const { terminalFontSize, setTerminalFontSize } = useTerminalFontSize() const { appearance, setAppearance } = useAppearance() + const { voiceMode, setVoiceMode } = useVoiceMode() // Voice language state - read from localStorage const [voiceLanguage, setVoiceLanguage] = useState(() => { @@ -101,6 +108,7 @@ export default function SettingsPage() { const currentFontScaleLabel = fontScaleOptions.find((opt) => opt.value === fontScale)?.label ?? '100%' const currentTerminalFontSizeLabel = terminalFontSizeOptions.find((opt) => opt.value === terminalFontSize)?.label ?? '13px' const currentVoiceLanguage = voiceLanguages.find((lang) => lang.code === voiceLanguage) + const currentVoiceModeLabel = voiceModeOptions.find((option) => option.value === voiceMode)?.labelKey ?? 'settings.voice.mode.assistant' const handleLocaleChange = (newLocale: Locale) => { setLocale(newLocale) @@ -132,6 +140,10 @@ export default function SettingsPage() { setIsVoiceOpen(false) } + const handleVoiceModeChange = (mode: VoiceMode) => { + setVoiceMode(mode) + } + // Close dropdown when clicking outside useEffect(() => { if (!isOpen && !isAppearanceOpen && !isFontOpen && !isTerminalFontOpen && !isVoiceOpen) return @@ -404,7 +416,29 @@ export default function SettingsPage() {
{t('settings.voice.title')}
-
+
+
{t('settings.voice.mode')}
+
+ {voiceModeOptions.map((option) => { + const isSelected = voiceMode === option.value + return ( + + ) + })} +
+
+
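[Review note, not part of the patch] useVoiceMode persists the mode and announces changes on two channels: the browser 'storage' event covers other tabs, while a custom window event covers the writing tab itself, where 'storage' never fires. A minimal same-tab sketch in TypeScript follows, using the key and event name defined in useVoiceMode.ts; the log call is illustrative only.

// setVoiceMode() performs exactly this write-then-dispatch sequence, and every
// mounted useVoiceMode() instance re-reads localStorage in response.
window.addEventListener('hapi-voice-mode-change', () => {
  console.log('voice mode:', localStorage.getItem('hapi-voice-mode'))
})

localStorage.setItem('hapi-voice-mode', 'dictation-elevenlabs')
window.dispatchEvent(new Event('hapi-voice-mode-change'))
// -> voice mode: dictation-elevenlabs

readVoiceMode also maps the legacy stored value 'dictation' to 'dictation-local', so users who enabled dictation before this change keep local dictation rather than falling back to the assistant.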