From f5a8eb69fc7ea386638c31b5cbf430b7e4099a86 Mon Sep 17 00:00:00 2001 From: heimoshuiyu Date: Thu, 5 Feb 2026 15:44:15 +0800 Subject: [PATCH 1/7] feat: add voice input feature move whisper config into config document whisper voice config remove tui voice enabled Fix voice error handling and whisper context --- packages/app/src/components/prompt-input.tsx | 224 ++++++++++++++++++ .../cli/cmd/tui/component/prompt/index.tsx | 128 ++++++++++ .../opencode/src/cli/cmd/tui/util/voice.ts | 132 +++++++++++ packages/opencode/src/config/config.ts | 26 ++ packages/opencode/src/server/routes/voice.ts | 46 ++++ packages/opencode/src/server/server.ts | 2 + packages/opencode/src/voice/whisper.ts | 153 ++++++++++++ packages/sdk/js/src/gen/types.gen.ts | 43 ++++ packages/sdk/js/src/v2/gen/sdk.gen.ts | 27 +++ packages/sdk/js/src/v2/gen/types.gen.ts | 63 +++++ packages/ui/src/components/icon.tsx | 1 + packages/web/src/content/docs/config.mdx | 27 +++ 12 files changed, 872 insertions(+) create mode 100644 packages/opencode/src/cli/cmd/tui/util/voice.ts create mode 100644 packages/opencode/src/server/routes/voice.ts create mode 100644 packages/opencode/src/voice/whisper.ts diff --git a/packages/app/src/components/prompt-input.tsx b/packages/app/src/components/prompt-input.tsx index b897e394aa18..198460c9323a 100644 --- a/packages/app/src/components/prompt-input.tsx +++ b/packages/app/src/components/prompt-input.tsx @@ -251,6 +251,16 @@ export const PromptInput: Component = (props) => { applyingHistory: false, }) + const [recording, setRecording] = createSignal(false) + const [transcribing, setTranscribing] = createSignal(false) + const audio = { + recorder: undefined as MediaRecorder | undefined, + stream: undefined as MediaStream | undefined, + controller: undefined as AbortController | undefined, + chunks: [] as Blob[], + mime: "", + } + const MAX_HISTORY = 100 const [history, setHistory] = persisted( Persist.global("prompt-history", ["prompt-history.v1"]), @@ -384,6 +394,204 @@ export const PromptInput: Component = (props) => { addPart({ type: "text", content: plainText, start: 0, end: 0 }) } + const isVoiceSupported = () => + typeof navigator !== "undefined" && + typeof window !== "undefined" && + Boolean(navigator.mediaDevices?.getUserMedia) && + typeof MediaRecorder !== "undefined" + + const stopStream = () => { + audio.stream?.getTracks().forEach((track) => track.stop()) + audio.stream = undefined + } + + const recordStart = async () => { + if (!isVoiceSupported()) { + showToast({ + title: "Voice input unavailable", + description: "Your browser does not support audio recording.", + }) + return false + } + if (audio.recorder) return false + + const stream = await navigator.mediaDevices + .getUserMedia({ audio: true }) + .catch(() => undefined) + if (!stream) { + showToast({ + title: "Microphone blocked", + description: "Allow microphone access to start recording.", + }) + return false + } + + // ensure we can clean up stream even if mime unsupported + audio.stream = stream + + const preferred = "audio/webm;codecs=opus" + const fallback = "audio/webm" + const mime = MediaRecorder.isTypeSupported(preferred) + ? preferred + : MediaRecorder.isTypeSupported(fallback) + ? fallback + : "" + if (!mime) { + stopStream() + showToast({ + title: "Voice input unavailable", + description: "This browser does not support the available audio formats.", + }) + return false + } + const recorder = new MediaRecorder(stream, { mimeType: mime }) + + audio.mime = recorder.mimeType || mime + audio.chunks = [] + audio.recorder = recorder + + recorder.ondataavailable = (event) => { + if (event.data.size === 0) return + audio.chunks.push(event.data) + } + + recorder.start() + setRecording(true) + return true + } + + const recordStop = async () => { + if (!audio.recorder) return + const recorder = audio.recorder + audio.recorder = undefined + + const result = new Promise((resolve) => { + recorder.onstop = () => { + resolve(new Blob(audio.chunks, { type: audio.mime || "audio/webm" })) + } + }) + + recorder.stop() + const blob = await result + stopStream() + setRecording(false) + return blob + } + + const transcribeAudio = async (blob: Blob) => { + if (!blob.size) { + showToast({ + title: "No audio captured", + description: "Try recording again.", + }) + return + } + + const mime = blob.type || "audio/webm" + const filename = mime.includes("webm") ? "audio.webm" : "audio.dat" + const file = new File([blob], filename, { type: mime }) + const form = new FormData() + const currentPrompt = prompt.current() + const promptText = currentPrompt.map((part) => ("content" in part ? part.content : "")).join("") + form.append("file", file) + if (params.id) { + form.append("sessionID", params.id) + } + if (promptText.trim()) { + form.append("prompt", promptText) + } + + const fetcher = platform.fetch ?? fetch + const controller = new AbortController() + audio.controller = controller + setTranscribing(true) + const response = await fetcher(`${sdk.url}/voice/transcribe`, { + method: "POST", + body: form, + signal: controller.signal, + }).catch(() => undefined) + + audio.controller = undefined + + if (!response) { + setTranscribing(false) + if (controller.signal.aborted) return + showToast({ + title: "Transcription failed", + description: "Failed to reach the server.", + }) + return + } + + const payload = await response.json().catch(() => ({ text: "" })) + const text = typeof payload?.text === "string" ? payload.text : "" + setTranscribing(false) + + if (!response.ok) { + if (controller.signal.aborted) return + showToast({ + title: "Transcription failed", + description: text || "Request failed.", + }) + return + } + + if (!text.trim()) { + showToast({ + title: "No speech detected", + description: "Try speaking closer to the microphone.", + }) + return + } + + addPart({ type: "text", content: text, start: 0, end: 0 }) + requestAnimationFrame(() => { + editorRef.focus() + queueScroll() + }) + } + + const toggleVoice = async () => { + if (transcribing()) { + const controller = audio.controller + if (controller) { + controller.abort() + setTranscribing(false) + showToast({ + title: "Transcription cancelled", + description: "Stopped the current transcription.", + }) + } + return + } + + if (recording()) { + const blob = await recordStop() + if (!blob) return + await transcribeAudio(blob) + return + } + + await recordStart() + } + + const voiceTitle = createMemo(() => + transcribing() ? "Cancel transcription" : recording() ? "Stop recording" : "Voice input", + ) + + command.register(() => [ + { + id: "prompt.voice", + title: "Voice input", + description: "Start or stop voice recording", + category: "Prompt", + keybind: "mod+shift+m", + onSelect: () => { + void toggleVoice() + }, + }, + ]) + const handleGlobalDragOver = (event: DragEvent) => { if (dialog.active) return @@ -428,6 +636,13 @@ export const PromptInput: Component = (props) => { document.removeEventListener("dragover", handleGlobalDragOver) document.removeEventListener("dragleave", handleGlobalDragLeave) document.removeEventListener("drop", handleGlobalDrop) + if (transcribing()) { + const controller = audio.controller + if (controller) controller.abort() + setTranscribing(false) + } + if (!recording()) return + void recordStop() }) createEffect(() => { @@ -2049,6 +2264,15 @@ export const PromptInput: Component = (props) => { + + + sync.data.config.tui?.voice) + const voice = Voice.create({ + config: voiceConfig, + transcription: () => sync.data.config.voice, + sessionID: () => props.sessionID, + prompt: () => store.prompt.input, + }) const fileStyleId = syntax().getStyleId("extmark.file")! const agentStyleId = syntax().getStyleId("extmark.agent")! @@ -123,6 +131,8 @@ export function Prompt(props: PromptProps) { extmarkToPartIndex: Map interrupt: number placeholder: number + recording: boolean + processing: boolean }>({ placeholder: Math.floor(Math.random() * PLACEHOLDERS.length), prompt: { @@ -132,6 +142,8 @@ export function Prompt(props: PromptProps) { mode: "normal", extmarkToPartIndex: new Map(), interrupt: 0, + recording: false, + processing: false, }) // Initialize agent/model/variant from last user message when session changes @@ -180,6 +192,16 @@ export function Prompt(props: PromptProps) { dialog.clear() }, }, + { + title: "Voice input", + value: "prompt.voice", + disabled: true, + keybind: "input_voice", + category: "Prompt", + onSelect: async () => { + await toggleVoice() + }, + }, { title: "Paste", value: "prompt.paste", @@ -680,6 +702,84 @@ export function Prompt(props: PromptProps) { ) } + async function toggleVoice() { + if (store.processing) { + const cancelled = voice.cancel() + if (cancelled) { + setStore("processing", false) + toast.show({ + message: "Transcription cancelled", + variant: "info", + duration: 1500, + }) + } + return + } + + if (store.recording) { + setStore("recording", false) + setStore("processing", true) + const result = await voice.stop().catch((error) => { + toast.error(error) + return undefined + }) + setStore("processing", false) + if (result?.cancelled) return + if (!result) { + toast.show({ + message: "Recording failed (empty audio)", + variant: "warning", + }) + return + } + if (!result.text.trim()) { + toast.show({ + message: "No speech detected (Whisper returned empty text)", + variant: "warning", + }) + return + } + + input.insertText(result.text) + input.getLayoutNode().markDirty() + input.gotoBufferEnd() + renderer.requestRender() + return + } + + const enabled = voice.isEnabled() + if (!enabled) { + toast.show({ + message: "Voice input unavailable (disabled or missing OPENCODE_WHISPER_API_KEY)", + variant: "warning", + }) + return + } + + setStore("recording", true) + toast.show({ + message: "Recording... press keybind again to stop", + variant: "info", + duration: 2000, + }) + const ok = await voice.start().catch((error) => { + toast.error(error) + return false + }) + if (ok) return + setStore("recording", false) + toast.show({ + message: "Failed to start recording", + variant: "error", + }) + } + + onCleanup(() => { + if (store.processing) voice.cancel() + if (!store.recording) return + voice.stop().catch(() => {}) + }) + async function pasteImage(file: { filename?: string; content: string; mime: string }) { const currentOffset = input.visualCursor.offset const extmarkStart = currentOffset @@ -736,6 +836,19 @@ export function Prompt(props: PromptProps) { return !!current }) + const voiceEnabled = createMemo(() => voice.isEnabled()) + const voiceLabel = createMemo(() => { + if (store.processing) return "Transcribing" + if (store.recording) return "Stop" + return "Record" + }) + const voiceColor = createMemo(() => { + if (store.processing) return theme.warning + if (store.recording) return theme.warning + if (!voiceEnabled()) return theme.textMuted + return theme.text + }) + const spinnerDef = createMemo(() => { const color = local.agent.color(local.agent.current().name) return { @@ -831,6 +944,11 @@ export function Prompt(props: PromptProps) { } // If no image, let the default paste behavior continue } + if (keybind.match("input_voice", e)) { + e.preventDefault() + await toggleVoice() + return + } if (keybind.match("input_clear", e) && store.prompt.input !== "") { input.clear() input.extmarks.clear() @@ -991,6 +1109,16 @@ export function Prompt(props: PromptProps) { + + { + if (!voiceEnabled() && !store.recording && !store.processing) return + await toggleVoice() + }} + > + {voiceLabel()} + diff --git a/packages/opencode/src/cli/cmd/tui/util/voice.ts b/packages/opencode/src/cli/cmd/tui/util/voice.ts new file mode 100644 index 000000000000..f176c8495e84 --- /dev/null +++ b/packages/opencode/src/cli/cmd/tui/util/voice.ts @@ -0,0 +1,132 @@ +import { tmpdir } from "os" +import path from "path" +import { Config } from "@/config/config" +import { Whisper } from "@/voice/whisper" + +export type VoiceConfig = { + command?: string[] + mime?: string +} + +const defaultCommands = [ + ["ffmpeg", "-y", "-f", "pulse", "-i", "default", "-ac", "1", "-ar", "16000", "-f", "mp3", "{output}"], + ["ffmpeg", "-y", "-f", "alsa", "-i", "default", "-ac", "1", "-ar", "16000", "-f", "mp3", "{output}"], + ["sox", "-d", "-c", "1", "-r", "16000", "{output}"], + ["rec", "-c", "1", "-r", "16000", "{output}"], + ["arecord", "-f", "S16_LE", "-c", "1", "-r", "16000", "{output}"], +] + +const defaultMime = "audio/mpeg" + +const pickCommand = (config?: VoiceConfig) => { + if (config?.command?.length) return config.command + for (const candidate of defaultCommands) { + const bin = candidate[0] + if (!bin) continue + if (Bun.which(bin)) return candidate + } + return undefined +} + +const readStream = async (stream?: ReadableStream | number | null) => { + if (!stream || typeof stream === "number") return "" + return new Response(stream).text().catch(() => "") +} + +export namespace Voice { + export function create(input: { + config: () => VoiceConfig | undefined + transcription?: () => Config.Info["voice"] | undefined + sessionID?: () => string | undefined + prompt?: () => string | undefined + }) { + const state = { + proc: undefined as ReturnType | undefined, + output: undefined as string | undefined, + controller: undefined as AbortController | undefined, + cancelled: false, + } + + const isEnabled = () => { + if (!input.transcription?.()?.whisper?.apiKey) return false + return true + } + + const start = async () => { + if (state.proc) return false + const config = input.config() + const command = pickCommand(config) + if (!command) return false + state.output = path.join(tmpdir(), `opencode-voice-${crypto.randomUUID()}.mp3`) + const args = command.map((entry) => entry.replaceAll("{output}", state.output!)) + state.proc = Bun.spawn(args, { stdout: "pipe", stderr: "pipe" }) + console.log("voice recorder started", { args, output: state.output }) + await Bun.sleep(100) + return true + } + + const stop = async () => { + if (!state.proc || !state.output) return + const target = state.proc + state.proc = undefined + const pathResult = state.output + state.output = undefined + target.kill() + await target.exited.catch(() => {}) + + const stdout = await readStream(target.stdout) + const stderr = await readStream(target.stderr) + if (stdout || stderr) { + console.log("voice recorder output", { stdout, stderr }) + } + + const mime = input.config()?.mime ?? defaultMime + const buffer = await Bun.file(pathResult).arrayBuffer().catch(() => undefined) + console.log("voice recorder bytes", { bytes: buffer?.byteLength ?? 0 }) + await Bun.file(pathResult).delete().catch(() => {}) + if (!buffer) return + + const blob = new Blob([buffer], { type: mime }) + const apiFile = new File([blob], "audio.mp3", { type: mime }) + console.log("whisper transcribe start", { + bytes: buffer.byteLength, + url: input.transcription?.()?.whisper?.url, + model: input.transcription?.()?.whisper?.model, + language: input.transcription?.()?.whisper?.language, + }) + state.cancelled = false + state.controller = new AbortController() + const result = await Whisper.transcribe({ + file: apiFile, + mime, + sessionID: input.sessionID?.(), + prompt: input.prompt?.(), + signal: state.controller.signal, + voice: input.transcription?.(), + }) + .then((response) => ({ text: response.text, cancelled: false })) + .catch((error) => { + console.log("whisper transcribe failed", { error: String(error) }) + if (error?.name === "AbortError" || state.cancelled) return { text: "", cancelled: true } + throw error + }) + state.controller = undefined + if (!result) return + return result + } + + const cancel = () => { + if (!state.controller) return false + state.cancelled = true + state.controller.abort() + return true + } + + return { + isEnabled, + start, + stop, + cancel, + } + } +} diff --git a/packages/opencode/src/config/config.ts b/packages/opencode/src/config/config.ts index dfb86dbe26f3..864b69c8eafa 100644 --- a/packages/opencode/src/config/config.ts +++ b/packages/opencode/src/config/config.ts @@ -823,6 +823,7 @@ export namespace Config { variant_cycle: z.string().optional().default("ctrl+t").describe("Cycle model variants"), input_clear: z.string().optional().default("ctrl+c").describe("Clear input field"), input_paste: z.string().optional().default("ctrl+v").describe("Paste from clipboard"), + input_voice: z.string().optional().default("v").describe("Toggle voice input"), input_submit: z.string().optional().default("return").describe("Submit input"), input_newline: z .string() @@ -929,8 +930,32 @@ export namespace Config { .enum(["auto", "stacked"]) .optional() .describe("Control diff rendering style: 'auto' adapts to terminal width, 'stacked' always shows single column"), + voice: z + .object({ + command: z + .array(z.string()) + .optional() + .describe("Recorder command template with {output} placeholder"), + mime: z.string().optional().describe("Recorded audio mime type"), + }) + .optional() + .describe("Voice input settings"), }) + export const Voice = z + .object({ + whisper: z + .object({ + url: z.string().optional().describe("Whisper API URL"), + apiKey: z.string().optional().describe("Whisper API key"), + model: z.string().optional().describe("Whisper model name"), + language: z.string().optional().describe("Whisper language code"), + }) + .optional() + .describe("Whisper transcription settings"), + }) + .describe("Voice transcription settings") + export const Server = z .object({ port: z.number().int().positive().optional().describe("Port to listen on"), @@ -1009,6 +1034,7 @@ export namespace Config { keybinds: Keybinds.optional().describe("Custom keybind configurations"), logLevel: Log.Level.optional().describe("Log level"), tui: TUI.optional().describe("TUI specific settings"), + voice: Voice.optional().describe("Voice transcription settings"), server: Server.optional().describe("Server configuration for opencode serve and web commands"), command: z .record(z.string(), Command) diff --git a/packages/opencode/src/server/routes/voice.ts b/packages/opencode/src/server/routes/voice.ts new file mode 100644 index 000000000000..f044a4d506fe --- /dev/null +++ b/packages/opencode/src/server/routes/voice.ts @@ -0,0 +1,46 @@ +import { describeRoute, resolver } from "hono-openapi" +import { zValidator } from "@hono/zod-validator" +import z from "zod" +import { Whisper } from "@/voice/whisper" +import { lazy } from "@/util/lazy" +import { Hono } from "hono" + +export const VoiceRoutes = lazy(() => + new Hono().post( + "/transcribe", + describeRoute({ + summary: "Transcribe audio", + description: "Transcribe an audio file with Whisper", + operationId: "audio.transcribe", + responses: { + 200: { + description: "Transcription result", + content: { + "application/json": { + schema: resolver(Whisper.Response), + }, + }, + }, + }, + }), + zValidator( + "form", + z.object({ + file: z.instanceof(File), + sessionID: z.string().optional(), + prompt: z.string().optional(), + }), + ), + async (c) => { + const data = c.req.valid("form") + const file = data.file + const result = await Whisper.transcribe({ + file, + mime: file.type || "audio/wav", + sessionID: data.sessionID, + prompt: data.prompt, + }) + return c.json(result) + }, + ), +) diff --git a/packages/opencode/src/server/server.ts b/packages/opencode/src/server/server.ts index 015553802a47..e01aa4276448 100644 --- a/packages/opencode/src/server/server.ts +++ b/packages/opencode/src/server/server.ts @@ -40,6 +40,7 @@ import { QuestionRoutes } from "./routes/question" import { PermissionRoutes } from "./routes/permission" import { GlobalRoutes } from "./routes/global" import { MDNS } from "./mdns" +import { VoiceRoutes } from "./routes/voice" // @ts-ignore This global is needed to prevent ai-sdk from logging warnings to stdout https://github.com/vercel/ai/blob/2dc67e0ef538307f21368db32d5a12345d98831b/packages/ai/src/logger/log-warnings.ts#L85 globalThis.AI_SDK_LOG_WARNINGS = false @@ -224,6 +225,7 @@ export namespace Server { .route("/permission", PermissionRoutes()) .route("/question", QuestionRoutes()) .route("/provider", ProviderRoutes()) + .route("/voice", VoiceRoutes()) .route("/", FileRoutes()) .route("/mcp", McpRoutes()) .route("/tui", TuiRoutes()) diff --git a/packages/opencode/src/voice/whisper.ts b/packages/opencode/src/voice/whisper.ts new file mode 100644 index 000000000000..8a3c1f91f424 --- /dev/null +++ b/packages/opencode/src/voice/whisper.ts @@ -0,0 +1,153 @@ +import { Config } from "@/config/config" +import { Session } from "@/session" +import { tmpdir } from "os" +import path from "path" +import z from "zod" + +const toWavOrMp3 = async (input: { buffer: ArrayBuffer; mime: string }) => { + const isWav = input.mime.includes("wav") + const isMp3 = input.mime.includes("mpeg") || input.mime.includes("mp3") + if (isWav || isMp3) { + const name = isWav ? "audio.wav" : "audio.mp3" + const mime = isWav ? "audio/wav" : "audio/mpeg" + return { buffer: input.buffer, name, mime } + } + + const outPath = path.join(tmpdir(), `opencode-voice-${crypto.randomUUID()}.mp3`) + const proc = Bun.spawn( + [ + "ffmpeg", + "-y", + "-f", + "webm", + "-i", + "pipe:0", + "-ac", + "1", + "-ar", + "16000", + "-f", + "mp3", + outPath, + ], + { + stdin: "pipe", + stdout: "ignore", + stderr: "ignore", + }, + ) + proc.stdin?.write(new Uint8Array(input.buffer)) + proc.stdin?.end() + await proc.exited + + const file = Bun.file(outPath, { type: "audio/mpeg" }) + const buffer = await file.arrayBuffer().catch(() => undefined) + await Bun.file(outPath).delete().catch(() => {}) + if (!buffer) throw new Error("Failed to convert audio") + return { buffer, name: "audio.mp3", mime: "audio/mpeg" } +} + +const getLastAssistantText = async (sessionID?: string) => { + if (!sessionID) return "" + return Promise.resolve() + .then(() => Session.messages({ sessionID, limit: 50 })) + .then((messages) => { + for (let i = messages.length - 1; i >= 0; i -= 1) { + const msg = messages[i] + if (msg.info.role !== "assistant") continue + const text = msg.parts + .filter((part) => part.type === "text") + .map((part) => part.text) + .join(" ") + .trim() + if (text) return text + } + return "" + }) + .catch((error) => { + console.log("whisper session lookup failed", { error: String(error) }) + return "" + }) +} + +const buildPrompt = (input: { prompt?: string; assistant?: string }) => { + const head = input.assistant?.trim() ?? "" + const tail = input.prompt?.trim() ?? "" + if (!head) return tail + if (!tail) return head + return `${head} ${tail}` +} + +export namespace Whisper { + export const Request = z.object({ + file: z.instanceof(File), + mime: z.string(), + sessionID: Session.Info.shape.id.optional(), + prompt: z.string().optional(), + }) + + export const Response = z.object({ + text: z.string().default(""), + }) + + export type Response = z.infer + + export async function transcribe( + input: z.infer & { signal?: AbortSignal; voice?: Config.Info["voice"] }, + ) { + const voice = input.voice ?? (await Config.get()).voice + const whisper = voice?.whisper + const apiKey = whisper?.apiKey + if (!apiKey) { + throw new Error("Missing voice.whisper.apiKey") + } + + const content = await input.file.arrayBuffer() + const prepared = await toWavOrMp3({ + buffer: content, + mime: input.mime, + }) + + const assistant = await getLastAssistantText(input.sessionID) + const prompt = buildPrompt({ assistant, prompt: input.prompt }) + + const form = new FormData() + form.append("file", new Blob([prepared.buffer], { type: prepared.mime }), prepared.name) + form.append("model", whisper?.model ?? "whisper-1") + form.append("response_format", "json") + if (whisper?.language) { + form.append("language", whisper.language) + } + if (prompt) { + form.append("prompt", prompt) + } + + const url = whisper?.url ?? "http://127.0.0.1:5000/v1/audio/transcriptions" + console.log("whisper request", { + url, + model: whisper?.model ?? "whisper-1", + language: whisper?.language, + bytes: prepared.buffer.byteLength, + }) + const result = await fetch(url, { + method: "POST", + headers: { + Authorization: `Bearer ${apiKey}`, + }, + body: form, + signal: input.signal, + }) + + if (!result.ok) { + const message = await result.text().catch(() => "") + throw new Error(message || "Whisper request failed") + } + + const contentType = result.headers.get("content-type") ?? "" + const body = await result.text().catch(() => "") + console.log("whisper response", { contentType, body }) + const payload = body ? JSON.parse(body) : { text: "" } + const text = typeof payload?.text === "string" ? payload.text : "" + return Response.parse({ text }) + } +} diff --git a/packages/sdk/js/src/gen/types.gen.ts b/packages/sdk/js/src/gen/types.gen.ts index 8eefe5bfe985..43ad006237d3 100644 --- a/packages/sdk/js/src/gen/types.gen.ts +++ b/packages/sdk/js/src/gen/types.gen.ts @@ -938,6 +938,10 @@ export type KeybindsConfig = { * Paste from clipboard */ input_paste?: string + /** + * Toggle voice input + */ + input_voice?: string /** * Submit input */ @@ -1207,6 +1211,45 @@ export type Config = { * Control diff rendering style: 'auto' adapts to terminal width, 'stacked' always shows single column */ diff_style?: "auto" | "stacked" + /** + * Voice input settings + */ + voice?: { + /** + * Recorder command template with {output} placeholder + */ + command?: Array + /** + * Recorded audio mime type + */ + mime?: string + } + } + /** + * Voice transcription settings + */ + voice?: { + /** + * Whisper transcription settings + */ + whisper?: { + /** + * Whisper API URL + */ + url?: string + /** + * Whisper API key + */ + apiKey?: string + /** + * Whisper model name + */ + model?: string + /** + * Whisper language code + */ + language?: string + } } /** * Command configuration, see https://opencode.ai/docs/commands diff --git a/packages/sdk/js/src/v2/gen/sdk.gen.ts b/packages/sdk/js/src/v2/gen/sdk.gen.ts index b757b7535075..cfb40034fd3d 100644 --- a/packages/sdk/js/src/v2/gen/sdk.gen.ts +++ b/packages/sdk/js/src/v2/gen/sdk.gen.ts @@ -8,6 +8,7 @@ import type { AppLogErrors, AppLogResponses, AppSkillsResponses, + AudioTranscribeResponses, Auth as Auth3, AuthRemoveErrors, AuthRemoveResponses, @@ -2161,6 +2162,27 @@ export class Provider extends HeyApiClient { } } +export class Audio extends HeyApiClient { + /** + * Transcribe audio + * + * Transcribe an audio file with Whisper + */ + public transcribe( + parameters?: { + directory?: string + }, + options?: Options, + ) { + const params = buildClientParams([parameters], [{ args: [{ in: "query", key: "directory" }] }]) + return (options?.client ?? this.client).post({ + url: "/voice/transcribe", + ...options, + ...params, + }) + } +} + export class Find extends HeyApiClient { /** * Find text @@ -3251,6 +3273,11 @@ export class OpencodeClient extends HeyApiClient { return (this._provider ??= new Provider({ client: this.client })) } + private _audio?: Audio + get audio(): Audio { + return (this._audio ??= new Audio({ client: this.client })) + } + private _find?: Find get find(): Find { return (this._find ??= new Find({ client: this.client })) diff --git a/packages/sdk/js/src/v2/gen/types.gen.ts b/packages/sdk/js/src/v2/gen/types.gen.ts index cb1606e3f610..89f6c8142ec7 100644 --- a/packages/sdk/js/src/v2/gen/types.gen.ts +++ b/packages/sdk/js/src/v2/gen/types.gen.ts @@ -1137,6 +1137,10 @@ export type KeybindsConfig = { * Paste from clipboard */ input_paste?: string + /** + * Toggle voice input + */ + input_voice?: string /** * Submit input */ @@ -1640,6 +1644,45 @@ export type Config = { * Control diff rendering style: 'auto' adapts to terminal width, 'stacked' always shows single column */ diff_style?: "auto" | "stacked" + /** + * Voice input settings + */ + voice?: { + /** + * Recorder command template with {output} placeholder + */ + command?: Array + /** + * Recorded audio mime type + */ + mime?: string + } + } + /** + * Voice transcription settings + */ + voice?: { + /** + * Whisper transcription settings + */ + whisper?: { + /** + * Whisper API URL + */ + url?: string + /** + * Whisper API key + */ + apiKey?: string + /** + * Whisper model name + */ + model?: string + /** + * Whisper language code + */ + language?: string + } } server?: ServerConfig /** @@ -4123,6 +4166,26 @@ export type ProviderOauthCallbackResponses = { export type ProviderOauthCallbackResponse = ProviderOauthCallbackResponses[keyof ProviderOauthCallbackResponses] +export type AudioTranscribeData = { + body?: never + path?: never + query?: { + directory?: string + } + url: "/voice/transcribe" +} + +export type AudioTranscribeResponses = { + /** + * Transcription result + */ + 200: { + text?: string + } +} + +export type AudioTranscribeResponse = AudioTranscribeResponses[keyof AudioTranscribeResponses] + export type FindTextData = { body?: never path?: never diff --git a/packages/ui/src/components/icon.tsx b/packages/ui/src/components/icon.tsx index 544c6abdd214..8e16c8214c5e 100644 --- a/packages/ui/src/components/icon.tsx +++ b/packages/ui/src/components/icon.tsx @@ -61,6 +61,7 @@ const icons = { share: ``, download: ``, menu: ``, + mic: ``, server: ``, branch: ``, edit: ``, diff --git a/packages/web/src/content/docs/config.mdx b/packages/web/src/content/docs/config.mdx index 5cc9d8666a96..5433a0c73cba 100644 --- a/packages/web/src/content/docs/config.mdx +++ b/packages/web/src/content/docs/config.mdx @@ -179,6 +179,33 @@ Available options: --- +### Voice + +Configure voice transcription for the Whisper API with the `voice` option. + +```json title="opencode.json" +{ + "$schema": "https://opencode.ai/config.json", + "voice": { + "whisper": { + "url": "http://127.0.0.1:5000/v1/audio/transcriptions", + "apiKey": "{env:OPENCODE_WHISPER_API_KEY}", + "model": "whisper-1", + "language": "en" + } + } +} +``` + +Available options: + +- `whisper.url` - Whisper transcription endpoint URL. +- `whisper.apiKey` - API key for the Whisper service. +- `whisper.model` - Whisper model name (default: `whisper-1`). +- `whisper.language` - Optional language hint (e.g. `en`). + +--- + ### Server You can configure server settings for the `opencode serve` and `opencode web` commands through the `server` option. From 28813065128c1f21f53309893d2333d33dbc80b6 Mon Sep 17 00:00:00 2001 From: heimoshuiyu Date: Sun, 18 Jan 2026 23:18:55 +0800 Subject: [PATCH 2/7] feat: add ALM voice transcription --- .../cli/cmd/tui/component/prompt/index.tsx | 4 +- .../opencode/src/cli/cmd/tui/util/voice.ts | 68 +++++++++--- packages/opencode/src/config/config.ts | 11 ++ packages/opencode/src/server/routes/voice.ts | 37 +++++-- packages/opencode/src/voice/alm.ts | 100 ++++++++++++++++++ packages/opencode/src/voice/whisper.ts | 6 +- packages/web/src/content/docs/config.mdx | 25 ++++- 7 files changed, 221 insertions(+), 30 deletions(-) create mode 100644 packages/opencode/src/voice/alm.ts diff --git a/packages/opencode/src/cli/cmd/tui/component/prompt/index.tsx b/packages/opencode/src/cli/cmd/tui/component/prompt/index.tsx index c5463241e650..ec45e0feab67 100644 --- a/packages/opencode/src/cli/cmd/tui/component/prompt/index.tsx +++ b/packages/opencode/src/cli/cmd/tui/component/prompt/index.tsx @@ -734,7 +734,7 @@ export function Prompt(props: PromptProps) { } if (!result.text.trim()) { toast.show({ - message: "No speech detected (Whisper returned empty text)", + message: "No speech detected (transcription returned empty text)", variant: "warning", }) return @@ -750,7 +750,7 @@ export function Prompt(props: PromptProps) { const enabled = voice.isEnabled() if (!enabled) { toast.show({ - message: "Voice input unavailable (disabled or missing OPENCODE_WHISPER_API_KEY)", + message: "Voice input unavailable (missing transcription API key)", variant: "warning", }) return diff --git a/packages/opencode/src/cli/cmd/tui/util/voice.ts b/packages/opencode/src/cli/cmd/tui/util/voice.ts index f176c8495e84..4d043a44a49c 100644 --- a/packages/opencode/src/cli/cmd/tui/util/voice.ts +++ b/packages/opencode/src/cli/cmd/tui/util/voice.ts @@ -1,6 +1,7 @@ import { tmpdir } from "os" import path from "path" import { Config } from "@/config/config" +import { Alm } from "@/voice/alm" import { Whisper } from "@/voice/whisper" export type VoiceConfig = { @@ -18,6 +19,15 @@ const defaultCommands = [ const defaultMime = "audio/mpeg" +const resolveType = (voice?: Config.Info["voice"]) => { + if (voice?.type) return voice.type + if (voice?.whisper?.apiKey && !voice?.alm?.apiKey) return "whisper" + if (voice?.alm?.apiKey && !voice?.whisper?.apiKey) return "alm" + if (voice?.whisper?.apiKey) return "whisper" + if (voice?.alm?.apiKey) return "alm" + return "whisper" +} + const pickCommand = (config?: VoiceConfig) => { if (config?.command?.length) return config.command for (const candidate of defaultCommands) { @@ -48,8 +58,10 @@ export namespace Voice { } const isEnabled = () => { - if (!input.transcription?.()?.whisper?.apiKey) return false - return true + const voice = input.transcription?.() + const type = resolveType(voice) + if (type === "alm") return !!voice?.alm?.apiKey + return !!voice?.whisper?.apiKey } const start = async () => { @@ -88,25 +100,47 @@ export namespace Voice { const blob = new Blob([buffer], { type: mime }) const apiFile = new File([blob], "audio.mp3", { type: mime }) - console.log("whisper transcribe start", { - bytes: buffer.byteLength, - url: input.transcription?.()?.whisper?.url, - model: input.transcription?.()?.whisper?.model, - language: input.transcription?.()?.whisper?.language, - }) + const voice = input.transcription?.() + const type = resolveType(voice) + if (type === "alm") { + console.log("voice transcribe start", { + provider: "alm", + bytes: buffer.byteLength, + url: voice?.alm?.url, + model: voice?.alm?.model, + }) + } + if (type === "whisper") { + console.log("voice transcribe start", { + provider: "whisper", + bytes: buffer.byteLength, + url: voice?.whisper?.url, + model: voice?.whisper?.model, + language: voice?.whisper?.language, + }) + } state.cancelled = false state.controller = new AbortController() - const result = await Whisper.transcribe({ - file: apiFile, - mime, - sessionID: input.sessionID?.(), - prompt: input.prompt?.(), - signal: state.controller.signal, - voice: input.transcription?.(), - }) + const result = await (type === "alm" + ? Alm.transcribe({ + file: apiFile, + mime, + sessionID: input.sessionID?.(), + prompt: input.prompt?.(), + signal: state.controller.signal, + voice, + }) + : Whisper.transcribe({ + file: apiFile, + mime, + sessionID: input.sessionID?.(), + prompt: input.prompt?.(), + signal: state.controller.signal, + voice, + })) .then((response) => ({ text: response.text, cancelled: false })) .catch((error) => { - console.log("whisper transcribe failed", { error: String(error) }) + console.log("voice transcribe failed", { error: String(error), provider: type }) if (error?.name === "AbortError" || state.cancelled) return { text: "", cancelled: true } throw error }) diff --git a/packages/opencode/src/config/config.ts b/packages/opencode/src/config/config.ts index 864b69c8eafa..d12b72f879d3 100644 --- a/packages/opencode/src/config/config.ts +++ b/packages/opencode/src/config/config.ts @@ -944,6 +944,7 @@ export namespace Config { export const Voice = z .object({ + type: z.enum(["whisper", "alm"]).optional().describe("Transcription provider type"), whisper: z .object({ url: z.string().optional().describe("Whisper API URL"), @@ -953,6 +954,16 @@ export namespace Config { }) .optional() .describe("Whisper transcription settings"), + alm: z + .object({ + url: z.string().optional().describe("Audio LM API URL"), + apiKey: z.string().optional().describe("Audio LM API key"), + model: z.string().optional().describe("Audio LM model name"), + prompt: z.string().optional().describe("Audio LM base prompt"), + system: z.string().optional().describe("Audio LM system prompt"), + }) + .optional() + .describe("Audio language model transcription settings"), }) .describe("Voice transcription settings") diff --git a/packages/opencode/src/server/routes/voice.ts b/packages/opencode/src/server/routes/voice.ts index f044a4d506fe..548316e49062 100644 --- a/packages/opencode/src/server/routes/voice.ts +++ b/packages/opencode/src/server/routes/voice.ts @@ -1,16 +1,27 @@ import { describeRoute, resolver } from "hono-openapi" import { zValidator } from "@hono/zod-validator" import z from "zod" +import { Config } from "@/config/config" +import { Alm } from "@/voice/alm" import { Whisper } from "@/voice/whisper" import { lazy } from "@/util/lazy" import { Hono } from "hono" +const resolveType = (voice?: Config.Info["voice"]) => { + if (voice?.type) return voice.type + if (voice?.whisper?.apiKey && !voice?.alm?.apiKey) return "whisper" + if (voice?.alm?.apiKey && !voice?.whisper?.apiKey) return "alm" + if (voice?.whisper?.apiKey) return "whisper" + if (voice?.alm?.apiKey) return "alm" + return "whisper" +} + export const VoiceRoutes = lazy(() => new Hono().post( "/transcribe", describeRoute({ summary: "Transcribe audio", - description: "Transcribe an audio file with Whisper", + description: "Transcribe an audio file with Whisper or an audio language model", operationId: "audio.transcribe", responses: { 200: { @@ -34,12 +45,24 @@ export const VoiceRoutes = lazy(() => async (c) => { const data = c.req.valid("form") const file = data.file - const result = await Whisper.transcribe({ - file, - mime: file.type || "audio/wav", - sessionID: data.sessionID, - prompt: data.prompt, - }) + const mime = file.type || "audio/wav" + const voice = (await Config.get()).voice + const type = resolveType(voice) + const result = await (type === "alm" + ? Alm.transcribe({ + file, + mime, + sessionID: data.sessionID, + prompt: data.prompt, + voice, + }) + : Whisper.transcribe({ + file, + mime, + sessionID: data.sessionID, + prompt: data.prompt, + voice, + })) return c.json(result) }, ), diff --git a/packages/opencode/src/voice/alm.ts b/packages/opencode/src/voice/alm.ts new file mode 100644 index 000000000000..de720e88c15f --- /dev/null +++ b/packages/opencode/src/voice/alm.ts @@ -0,0 +1,100 @@ +import z from "zod" +import { Config } from "@/config/config" +import { buildPrompt, getLastAssistantText, toWavOrMp3 } from "@/voice/whisper" + +const buildMessages = (input: { + system?: string + context?: string + audio: string +}) => { + const system = (input.system ?? "You are a professional speech-to-text transcriber. Your task is to transcribe the audio into text.").trim() + const context = input.context?.trim() + const text = context + ? `${system}\n\n${context}\n\nDO NOT answer user's question, just transcribe the audio into text.` + : system + return [ + { + role: "system" as const, + content: text, + }, + { + role: "user" as const, + content: [ + { type: "audio_url", audio_url: { url: input.audio } }, + { type: "text", text: "you are a professional speech to text transcriber, your task is to transcribe the audio into text." }, + ], + }, + ] +} + +export namespace Alm { + export const Request = z.object({ + file: z.instanceof(File), + mime: z.string(), + sessionID: z.string().optional(), + prompt: z.string().optional(), + }) + + export const Response = z.object({ + text: z.string().default(""), + }) + + export type Response = z.infer + + export async function transcribe( + input: z.infer & { signal?: AbortSignal; voice?: Config.Info["voice"] }, + ) { + const voice = input.voice ?? (await Config.get()).voice + const alm = voice?.alm + const apiKey = alm?.apiKey + if (!apiKey) { + throw new Error("Missing voice.alm.apiKey") + } + + const content = await input.file.arrayBuffer() + const prepared = await toWavOrMp3({ buffer: content, mime: input.mime }) + const audio = `data:${prepared.mime};base64,${Buffer.from(prepared.buffer).toString("base64")}` + + const assistant = await getLastAssistantText(input.sessionID) + const context = buildPrompt({ assistant, prompt: buildPrompt({ assistant: alm?.prompt, prompt: input.prompt }) }) + const messages = buildMessages({ + system: alm?.system, + context, + audio, + }) + + const payload = { + model: alm?.model ?? "gpt-4o-mini-transcribe", + messages, + temperature: 0, + } + + const url = alm?.url ?? "https://api.openai.com/v1/chat/completions" + console.log("alm request", { + url, + model: payload.model, + bytes: prepared.buffer.byteLength, + }) + + const result = await fetch(url, { + method: "POST", + headers: { + "Content-Type": "application/json", + Authorization: `Bearer ${apiKey}`, + }, + body: JSON.stringify(payload), + signal: input.signal, + }) + + if (!result.ok) { + const message = await result.text().catch(() => "") + throw new Error(message || "ALM request failed") + } + + const body = await result.text().catch(() => "") + console.log("alm response", { body }) + const parsed = body ? JSON.parse(body) : {} + const text = parsed?.choices?.[0]?.message?.content + return Response.parse({ text: typeof text === "string" ? text : "" }) + } +} diff --git a/packages/opencode/src/voice/whisper.ts b/packages/opencode/src/voice/whisper.ts index 8a3c1f91f424..7542f207b70a 100644 --- a/packages/opencode/src/voice/whisper.ts +++ b/packages/opencode/src/voice/whisper.ts @@ -4,7 +4,7 @@ import { tmpdir } from "os" import path from "path" import z from "zod" -const toWavOrMp3 = async (input: { buffer: ArrayBuffer; mime: string }) => { +export const toWavOrMp3 = async (input: { buffer: ArrayBuffer; mime: string }) => { const isWav = input.mime.includes("wav") const isMp3 = input.mime.includes("mpeg") || input.mime.includes("mp3") if (isWav || isMp3) { @@ -47,7 +47,7 @@ const toWavOrMp3 = async (input: { buffer: ArrayBuffer; mime: string }) => { return { buffer, name: "audio.mp3", mime: "audio/mpeg" } } -const getLastAssistantText = async (sessionID?: string) => { +export const getLastAssistantText = async (sessionID?: string) => { if (!sessionID) return "" return Promise.resolve() .then(() => Session.messages({ sessionID, limit: 50 })) @@ -70,7 +70,7 @@ const getLastAssistantText = async (sessionID?: string) => { }) } -const buildPrompt = (input: { prompt?: string; assistant?: string }) => { +export const buildPrompt = (input: { prompt?: string; assistant?: string }) => { const head = input.assistant?.trim() ?? "" const tail = input.prompt?.trim() ?? "" if (!head) return tail diff --git a/packages/web/src/content/docs/config.mdx b/packages/web/src/content/docs/config.mdx index 5433a0c73cba..4228fffe39a3 100644 --- a/packages/web/src/content/docs/config.mdx +++ b/packages/web/src/content/docs/config.mdx @@ -181,12 +181,13 @@ Available options: ### Voice -Configure voice transcription for the Whisper API with the `voice` option. +Configure voice transcription for Whisper or ALM with the `voice` option. ```json title="opencode.json" { "$schema": "https://opencode.ai/config.json", "voice": { + "type": "whisper", "whisper": { "url": "http://127.0.0.1:5000/v1/audio/transcriptions", "apiKey": "{env:OPENCODE_WHISPER_API_KEY}", @@ -197,12 +198,34 @@ Configure voice transcription for the Whisper API with the `voice` option. } ``` +```json title="opencode.json" +{ + "$schema": "https://opencode.ai/config.json", + "voice": { + "type": "alm", + "alm": { + "url": "https://api.openai.com/v1/chat/completions", + "apiKey": "{env:OPENCODE_ALM_API_KEY}", + "model": "gpt-4o-mini-transcribe", + "system": "You are a professional speech-to-text transcriber. Your task is to transcribe the audio into text.", + "prompt": "Keep technical terms unchanged." + } + } +} +``` + Available options: +- `type` - Transcription provider (`whisper` or `alm`). - `whisper.url` - Whisper transcription endpoint URL. - `whisper.apiKey` - API key for the Whisper service. - `whisper.model` - Whisper model name (default: `whisper-1`). - `whisper.language` - Optional language hint (e.g. `en`). +- `alm.url` - Audio LM transcription endpoint URL. +- `alm.apiKey` - API key for the ALM service. +- `alm.model` - Audio LM model name. +- `alm.prompt` - Optional base prompt for transcription. +- `alm.system` - Optional system prompt for transcription. --- From 8381836ddaf915f53b44f9eda970eba8d717bc3d Mon Sep 17 00:00:00 2001 From: heimoshuiyu Date: Thu, 5 Feb 2026 15:45:29 +0800 Subject: [PATCH 3/7] Add web deploy skill and configurable web proxy --- .opencode/skill/web-s3-deploy/SKILL.md | 30 ++++++++++++++++++++++++++ packages/opencode/src/flag/flag.ts | 1 + packages/opencode/src/server/server.ts | 7 +++--- 3 files changed, 35 insertions(+), 3 deletions(-) create mode 100644 .opencode/skill/web-s3-deploy/SKILL.md diff --git a/.opencode/skill/web-s3-deploy/SKILL.md b/.opencode/skill/web-s3-deploy/SKILL.md new file mode 100644 index 000000000000..e4e5cc585590 --- /dev/null +++ b/.opencode/skill/web-s3-deploy/SKILL.md @@ -0,0 +1,30 @@ +--- +name: web-s3-deploy +description: Build the web frontend, sync to S3, and invalidate CloudFront +--- + +## What I do +Provide a repeatable workflow to publish the web frontend to a public S3 bucket and refresh a CloudFront distribution so HTTPS updates are visible. + +## When to use me +Use this when you need to ship a new web UI build for OpenCode and make sure CloudFront serves the latest assets. + +## Checklist +1. Build the frontend locally. +2. Sync the build output to the S3 bucket. +3. Trigger a CloudFront invalidation to refresh cached assets. + +## Commands +```bash +bun run --cwd packages/app build +aws s3 sync packages/app/dist s3://opencode-hmsy --delete --exact-timestamps +aws cloudfront create-invalidation --distribution-id E30UYS44QZ0UX4 --paths "/*" +``` + +## Notes +- S3 website URL: http://opencode-hmsy.s3-website-ap-southeast-1.amazonaws.com +- CloudFront HTTPS URL: https://d3ir6x3lfy3u68.cloudfront.net +- OPENCODE_WEB_URL=https://d3ir6x3lfy3u68.cloudfront.net +- For S3 website hosting, ensure the bucket policy allows public read. +- The CloudFront distribution should use the S3 website endpoint as its origin for SPA routing. +- If you only need cache refresh after content changes, you can skip the build step. diff --git a/packages/opencode/src/flag/flag.ts b/packages/opencode/src/flag/flag.ts index b11058b34058..55d94390f1f3 100644 --- a/packages/opencode/src/flag/flag.ts +++ b/packages/opencode/src/flag/flag.ts @@ -30,6 +30,7 @@ export namespace Flag { export declare const OPENCODE_CLIENT: string export const OPENCODE_SERVER_PASSWORD = process.env["OPENCODE_SERVER_PASSWORD"] export const OPENCODE_SERVER_USERNAME = process.env["OPENCODE_SERVER_USERNAME"] + export const OPENCODE_WEB_URL = process.env["OPENCODE_WEB_URL"] // Experimental export const OPENCODE_EXPERIMENTAL = truthy("OPENCODE_EXPERIMENTAL") diff --git a/packages/opencode/src/server/server.ts b/packages/opencode/src/server/server.ts index e01aa4276448..0875f808f4ce 100644 --- a/packages/opencode/src/server/server.ts +++ b/packages/opencode/src/server/server.ts @@ -534,12 +534,13 @@ export namespace Server { ) .all("/*", async (c) => { const path = c.req.path - - const response = await proxy(`https://app.opencode.ai${path}`, { + // Fork override: default web URL points to personal CloudFront; upstream default was https://app.opencode.ai + const target = Flag.OPENCODE_WEB_URL ?? "https://d3ir6x3lfy3u68.cloudfront.net" + const response = await proxy(`${target}${path}`, { ...c.req, headers: { ...c.req.raw.headers, - host: "app.opencode.ai", + host: new URL(target).host, }, }) response.headers.set( From a5f2fe2a489b039748cba0cd3bb51f81f6a600f1 Mon Sep 17 00:00:00 2001 From: heimoshuiyu Date: Mon, 19 Jan 2026 02:25:46 +0800 Subject: [PATCH 4/7] Update default Whisper URL --- packages/opencode/src/voice/whisper.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/opencode/src/voice/whisper.ts b/packages/opencode/src/voice/whisper.ts index 7542f207b70a..b9a7fa77ac8b 100644 --- a/packages/opencode/src/voice/whisper.ts +++ b/packages/opencode/src/voice/whisper.ts @@ -122,7 +122,7 @@ export namespace Whisper { form.append("prompt", prompt) } - const url = whisper?.url ?? "http://127.0.0.1:5000/v1/audio/transcriptions" + const url = whisper?.url ?? "https://api.openai.com/v1/audio/transcriptions" console.log("whisper request", { url, model: whisper?.model ?? "whisper-1", From cfbe583465ab7001d7d4f1563f1343bc7c1e1029 Mon Sep 17 00:00:00 2001 From: heimoshuiyu Date: Thu, 22 Jan 2026 18:28:19 +0800 Subject: [PATCH 5/7] feat: show spinner while transcribing --- packages/app/src/components/prompt-input.tsx | 21 +++++++++++++------- packages/ui/src/components/icon-button.tsx | 7 ++++--- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/packages/app/src/components/prompt-input.tsx b/packages/app/src/components/prompt-input.tsx index 198460c9323a..42d2824896ed 100644 --- a/packages/app/src/components/prompt-input.tsx +++ b/packages/app/src/components/prompt-input.tsx @@ -38,6 +38,7 @@ import { ProviderIcon } from "@opencode-ai/ui/provider-icon" import type { IconName } from "@opencode-ai/ui/icons/provider" import { Tooltip, TooltipKeybind } from "@opencode-ai/ui/tooltip" import { IconButton } from "@opencode-ai/ui/icon-button" +import { Spinner } from "@opencode-ai/ui/spinner" import { Select } from "@opencode-ai/ui/select" import { getDirectory, getFilename, getFilenameTruncated } from "@opencode-ai/util/path" import { useDialog } from "@opencode-ai/ui/context/dialog" @@ -2265,13 +2266,19 @@ export const PromptInput: Component = (props) => { - + { @@ -10,7 +10,8 @@ export interface IconButtonProps extends ComponentProps { } export function IconButton(props: ComponentProps<"button"> & IconButtonProps) { - const [split, rest] = splitProps(props, ["variant", "size", "iconSize", "class", "classList"]) + const [split, rest] = splitProps(props, ["variant", "size", "iconSize", "class", "classList", "children"]) + const content = children(() => split.children) return ( & IconButtonProps) { [split.class ?? ""]: !!split.class, }} > - + {content() ?? } ) } From bcf0765314f237bf6a725370f902f509a4578c77 Mon Sep 17 00:00:00 2001 From: heimoshuiyu Date: Wed, 21 Jan 2026 01:45:03 +0800 Subject: [PATCH 6/7] Fix voice input insertion by ensuring selection is in editor When voice transcription completes, addPart now checks if the current selection is within the prompt editor. If the selection is outside the editor (e.g., user clicked on an assistant message during recording), it focuses the editor and restores the cursor position from prompt.cursor() before inserting the transcribed text. This prevents transcription results from being inserted into unintended locations like assistant messages. Also fixes cursor position logic to prefer real DOM position when selection is inside the editor, only falling back to prompt.cursor() when selection is outside. --- packages/app/src/components/prompt-input.tsx | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/packages/app/src/components/prompt-input.tsx b/packages/app/src/components/prompt-input.tsx index 42d2824896ed..9c41c25e8832 100644 --- a/packages/app/src/components/prompt-input.tsx +++ b/packages/app/src/components/prompt-input.tsx @@ -1071,9 +1071,19 @@ export const PromptInput: Component = (props) => { const addPart = (part: ContentPart) => { const selection = window.getSelection() - if (!selection || selection.rangeCount === 0) return + if (!selection) return + + const hasRange = selection.rangeCount > 0 + const inEditor = hasRange && editorRef.contains(selection.anchorNode) + const cursorPosition = inEditor + ? getCursorPosition(editorRef) + : (prompt.cursor() ?? getCursorPosition(editorRef)) + if (!inEditor) { + editorRef.focus() + setCursorPosition(editorRef, cursorPosition) + } + if (selection.rangeCount === 0) return - const cursorPosition = getCursorPosition(editorRef) const currentPrompt = prompt.current() const rawText = currentPrompt.map((p) => ("content" in p ? p.content : "")).join("") const textBeforeCursor = rawText.substring(0, cursorPosition) From 675d657c3d4929404a1933752f679ee9083e984c Mon Sep 17 00:00:00 2001 From: heimoshuiyu Date: Thu, 22 Jan 2026 19:16:51 +0800 Subject: [PATCH 7/7] fix(tui): show warning toast when clicking disabled voice button --- .../src/cli/cmd/tui/component/prompt/index.tsx | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/packages/opencode/src/cli/cmd/tui/component/prompt/index.tsx b/packages/opencode/src/cli/cmd/tui/component/prompt/index.tsx index ec45e0feab67..2b7a4dd6d856 100644 --- a/packages/opencode/src/cli/cmd/tui/component/prompt/index.tsx +++ b/packages/opencode/src/cli/cmd/tui/component/prompt/index.tsx @@ -1110,13 +1110,19 @@ export function Prompt(props: PromptProps) { - { - if (!voiceEnabled() && !store.recording && !store.processing) return + { + if (!voiceEnabled() && !store.recording && !store.processing) { + toast.show({ + message: "Voice input unavailable (missing transcription API key)", + variant: "warning", + }) + return + } await toggleVoice() - }} - > + }} + > {voiceLabel()}