diff --git a/.opencode/skill/web-s3-deploy/SKILL.md b/.opencode/skill/web-s3-deploy/SKILL.md new file mode 100644 index 000000000000..e4e5cc585590 --- /dev/null +++ b/.opencode/skill/web-s3-deploy/SKILL.md @@ -0,0 +1,30 @@ +--- +name: web-s3-deploy +description: Build the web frontend, sync to S3, and invalidate CloudFront +--- + +## What I do +Provide a repeatable workflow to publish the web frontend to a public S3 bucket and refresh a CloudFront distribution so updates are visible over HTTPS. + +## When to use me +Use this when you need to ship a new web UI build for OpenCode and make sure CloudFront serves the latest assets. + +## Checklist +1. Build the frontend locally. +2. Sync the build output to the S3 bucket. +3. Trigger a CloudFront invalidation to refresh cached assets. + +## Commands +```bash +bun run --cwd packages/app build +aws s3 sync packages/app/dist s3://opencode-hmsy --delete --exact-timestamps +aws cloudfront create-invalidation --distribution-id E30UYS44QZ0UX4 --paths "/*" +``` + +## Notes +- S3 website URL: http://opencode-hmsy.s3-website-ap-southeast-1.amazonaws.com +- CloudFront HTTPS URL: https://d3ir6x3lfy3u68.cloudfront.net +- Web proxy target (env var): OPENCODE_WEB_URL=https://d3ir6x3lfy3u68.cloudfront.net +- For S3 website hosting, ensure the bucket policy allows public read. +- The CloudFront distribution should use the S3 website endpoint as its origin for SPA routing. +- If you only need a cache refresh after content changes, you can skip the build step. 
diff --git a/packages/app/src/components/prompt-input.tsx b/packages/app/src/components/prompt-input.tsx index b897e394aa18..9c41c25e8832 100644 --- a/packages/app/src/components/prompt-input.tsx +++ b/packages/app/src/components/prompt-input.tsx @@ -38,6 +38,7 @@ import { ProviderIcon } from "@opencode-ai/ui/provider-icon" import type { IconName } from "@opencode-ai/ui/icons/provider" import { Tooltip, TooltipKeybind } from "@opencode-ai/ui/tooltip" import { IconButton } from "@opencode-ai/ui/icon-button" +import { Spinner } from "@opencode-ai/ui/spinner" import { Select } from "@opencode-ai/ui/select" import { getDirectory, getFilename, getFilenameTruncated } from "@opencode-ai/util/path" import { useDialog } from "@opencode-ai/ui/context/dialog" @@ -251,6 +252,16 @@ export const PromptInput: Component = (props) => { applyingHistory: false, }) + const [recording, setRecording] = createSignal(false) + const [transcribing, setTranscribing] = createSignal(false) + const audio = { + recorder: undefined as MediaRecorder | undefined, + stream: undefined as MediaStream | undefined, + controller: undefined as AbortController | undefined, + chunks: [] as Blob[], + mime: "", + } + const MAX_HISTORY = 100 const [history, setHistory] = persisted( Persist.global("prompt-history", ["prompt-history.v1"]), @@ -384,6 +395,204 @@ export const PromptInput: Component = (props) => { addPart({ type: "text", content: plainText, start: 0, end: 0 }) } + const isVoiceSupported = () => + typeof navigator !== "undefined" && + typeof window !== "undefined" && + Boolean(navigator.mediaDevices?.getUserMedia) && + typeof MediaRecorder !== "undefined" + + const stopStream = () => { + audio.stream?.getTracks().forEach((track) => track.stop()) + audio.stream = undefined + } + + const recordStart = async () => { + if (!isVoiceSupported()) { + showToast({ + title: "Voice input unavailable", + description: "Your browser does not support audio recording.", + }) + return false + } + if (audio.recorder) 
return false + + const stream = await navigator.mediaDevices + .getUserMedia({ audio: true }) + .catch(() => undefined) + if (!stream) { + showToast({ + title: "Microphone blocked", + description: "Allow microphone access to start recording.", + }) + return false + } + + // ensure we can clean up stream even if mime unsupported + audio.stream = stream + + const preferred = "audio/webm;codecs=opus" + const fallback = "audio/webm" + const mime = MediaRecorder.isTypeSupported(preferred) + ? preferred + : MediaRecorder.isTypeSupported(fallback) + ? fallback + : "" + if (!mime) { + stopStream() + showToast({ + title: "Voice input unavailable", + description: "This browser does not support the available audio formats.", + }) + return false + } + const recorder = new MediaRecorder(stream, { mimeType: mime }) + + audio.mime = recorder.mimeType || mime + audio.chunks = [] + audio.recorder = recorder + + recorder.ondataavailable = (event) => { + if (event.data.size === 0) return + audio.chunks.push(event.data) + } + + recorder.start() + setRecording(true) + return true + } + + const recordStop = async () => { + if (!audio.recorder) return + const recorder = audio.recorder + audio.recorder = undefined + + const result = new Promise((resolve) => { + recorder.onstop = () => { + resolve(new Blob(audio.chunks, { type: audio.mime || "audio/webm" })) + } + }) + + recorder.stop() + const blob = await result + stopStream() + setRecording(false) + return blob + } + + const transcribeAudio = async (blob: Blob) => { + if (!blob.size) { + showToast({ + title: "No audio captured", + description: "Try recording again.", + }) + return + } + + const mime = blob.type || "audio/webm" + const filename = mime.includes("webm") ? "audio.webm" : "audio.dat" + const file = new File([blob], filename, { type: mime }) + const form = new FormData() + const currentPrompt = prompt.current() + const promptText = currentPrompt.map((part) => ("content" in part ? 
part.content : "")).join("") + form.append("file", file) + if (params.id) { + form.append("sessionID", params.id) + } + if (promptText.trim()) { + form.append("prompt", promptText) + } + + const fetcher = platform.fetch ?? fetch + const controller = new AbortController() + audio.controller = controller + setTranscribing(true) + const response = await fetcher(`${sdk.url}/voice/transcribe`, { + method: "POST", + body: form, + signal: controller.signal, + }).catch(() => undefined) + + audio.controller = undefined + + if (!response) { + setTranscribing(false) + if (controller.signal.aborted) return + showToast({ + title: "Transcription failed", + description: "Failed to reach the server.", + }) + return + } + + const payload = await response.json().catch(() => ({ text: "" })) + const text = typeof payload?.text === "string" ? payload.text : "" + setTranscribing(false) + + if (!response.ok) { + if (controller.signal.aborted) return + showToast({ + title: "Transcription failed", + description: text || "Request failed.", + }) + return + } + + if (!text.trim()) { + showToast({ + title: "No speech detected", + description: "Try speaking closer to the microphone.", + }) + return + } + + addPart({ type: "text", content: text, start: 0, end: 0 }) + requestAnimationFrame(() => { + editorRef.focus() + queueScroll() + }) + } + + const toggleVoice = async () => { + if (transcribing()) { + const controller = audio.controller + if (controller) { + controller.abort() + setTranscribing(false) + showToast({ + title: "Transcription cancelled", + description: "Stopped the current transcription.", + }) + } + return + } + + if (recording()) { + const blob = await recordStop() + if (!blob) return + await transcribeAudio(blob) + return + } + + await recordStart() + } + + const voiceTitle = createMemo(() => + transcribing() ? "Cancel transcription" : recording() ? 
"Stop recording" : "Voice input", + ) + + command.register(() => [ + { + id: "prompt.voice", + title: "Voice input", + description: "Start or stop voice recording", + category: "Prompt", + keybind: "mod+shift+m", + onSelect: () => { + void toggleVoice() + }, + }, + ]) + const handleGlobalDragOver = (event: DragEvent) => { if (dialog.active) return @@ -428,6 +637,13 @@ export const PromptInput: Component = (props) => { document.removeEventListener("dragover", handleGlobalDragOver) document.removeEventListener("dragleave", handleGlobalDragLeave) document.removeEventListener("drop", handleGlobalDrop) + if (transcribing()) { + const controller = audio.controller + if (controller) controller.abort() + setTranscribing(false) + } + if (!recording()) return + void recordStop() }) createEffect(() => { @@ -855,9 +1071,19 @@ export const PromptInput: Component = (props) => { const addPart = (part: ContentPart) => { const selection = window.getSelection() - if (!selection || selection.rangeCount === 0) return + if (!selection) return + + const hasRange = selection.rangeCount > 0 + const inEditor = hasRange && editorRef.contains(selection.anchorNode) + const cursorPosition = inEditor + ? getCursorPosition(editorRef) + : (prompt.cursor() ?? getCursorPosition(editorRef)) + if (!inEditor) { + editorRef.focus() + setCursorPosition(editorRef, cursorPosition) + } + if (selection.rangeCount === 0) return - const cursorPosition = getCursorPosition(editorRef) const currentPrompt = prompt.current() const rawText = currentPrompt.map((p) => ("content" in p ? p.content : "")).join("") const textBeforeCursor = rawText.substring(0, cursorPosition) @@ -2049,6 +2275,21 @@ export const PromptInput: Component = (props) => { + + + sync.data.config.tui?.voice) + const voice = Voice.create({ + config: voiceConfig, + transcription: () => sync.data.config.voice, + sessionID: () => props.sessionID, + prompt: () => store.prompt.input, + }) const fileStyleId = syntax().getStyleId("extmark.file")! 
const agentStyleId = syntax().getStyleId("extmark.agent")! @@ -123,6 +131,8 @@ export function Prompt(props: PromptProps) { extmarkToPartIndex: Map interrupt: number placeholder: number + recording: boolean + processing: boolean }>({ placeholder: Math.floor(Math.random() * PLACEHOLDERS.length), prompt: { @@ -132,6 +142,8 @@ export function Prompt(props: PromptProps) { mode: "normal", extmarkToPartIndex: new Map(), interrupt: 0, + recording: false, + processing: false, }) // Initialize agent/model/variant from last user message when session changes @@ -180,6 +192,16 @@ export function Prompt(props: PromptProps) { dialog.clear() }, }, + { + title: "Voice input", + value: "prompt.voice", + disabled: true, + keybind: "input_voice", + category: "Prompt", + onSelect: async () => { + await toggleVoice() + }, + }, { title: "Paste", value: "prompt.paste", @@ -680,6 +702,84 @@ export function Prompt(props: PromptProps) { ) } + async function toggleVoice() { + if (store.processing) { + const cancelled = voice.cancel() + if (cancelled) { + setStore("processing", false) + toast.show({ + message: "Transcription cancelled", + variant: "info", + duration: 1500, + }) + } + return + } + + if (store.recording) { + setStore("recording", false) + setStore("processing", true) + const result = await voice.stop().catch((error) => { + toast.error(error) + return undefined + }) + setStore("processing", false) + if (result?.cancelled) return + if (!result) { + toast.show({ + message: "Recording failed (empty audio)", + variant: "warning", + }) + return + } + if (!result.text.trim()) { + toast.show({ + message: "No speech detected (transcription returned empty text)", + variant: "warning", + }) + return + } + + input.insertText(result.text) + input.getLayoutNode().markDirty() + input.gotoBufferEnd() + renderer.requestRender() + return + } + + const enabled = voice.isEnabled() + if (!enabled) { + toast.show({ + message: "Voice input unavailable (missing transcription API key)", + variant: 
"warning", + }) + return + } + + setStore("recording", true) + toast.show({ + message: "Recording... press keybind again to stop", + variant: "info", + duration: 2000, + }) + const ok = await voice.start().catch((error) => { + toast.error(error) + return false + }) + if (ok) return + setStore("recording", false) + toast.show({ + message: "Failed to start recording", + variant: "error", + }) + } + + onCleanup(() => { + if (store.processing) voice.cancel() + if (!store.recording) return + voice.stop().catch(() => {}) + }) + async function pasteImage(file: { filename?: string; content: string; mime: string }) { const currentOffset = input.visualCursor.offset const extmarkStart = currentOffset @@ -736,6 +836,19 @@ export function Prompt(props: PromptProps) { return !!current }) + const voiceEnabled = createMemo(() => voice.isEnabled()) + const voiceLabel = createMemo(() => { + if (store.processing) return "Transcribing" + if (store.recording) return "Stop" + return "Record" + }) + const voiceColor = createMemo(() => { + if (store.processing) return theme.warning + if (store.recording) return theme.warning + if (!voiceEnabled()) return theme.textMuted + return theme.text + }) + const spinnerDef = createMemo(() => { const color = local.agent.color(local.agent.current().name) return { @@ -831,6 +944,11 @@ export function Prompt(props: PromptProps) { } // If no image, let the default paste behavior continue } + if (keybind.match("input_voice", e)) { + e.preventDefault() + await toggleVoice() + return + } if (keybind.match("input_clear", e) && store.prompt.input !== "") { input.clear() input.extmarks.clear() @@ -991,6 +1109,22 @@ export function Prompt(props: PromptProps) { + + { + if (!voiceEnabled() && !store.recording && !store.processing) { + toast.show({ + message: "Voice input unavailable (missing transcription API key)", + variant: "warning", + }) + return + } + await toggleVoice() + }} + > + {voiceLabel()} + diff --git 
a/packages/opencode/src/cli/cmd/tui/util/voice.ts b/packages/opencode/src/cli/cmd/tui/util/voice.ts new file mode 100644 index 000000000000..4d043a44a49c --- /dev/null +++ b/packages/opencode/src/cli/cmd/tui/util/voice.ts @@ -0,0 +1,166 @@ +import { tmpdir } from "os" +import path from "path" +import { Config } from "@/config/config" +import { Alm } from "@/voice/alm" +import { Whisper } from "@/voice/whisper" + +export type VoiceConfig = { + command?: string[] + mime?: string +} + +const defaultCommands = [ + ["ffmpeg", "-y", "-f", "pulse", "-i", "default", "-ac", "1", "-ar", "16000", "-f", "mp3", "{output}"], + ["ffmpeg", "-y", "-f", "alsa", "-i", "default", "-ac", "1", "-ar", "16000", "-f", "mp3", "{output}"], + ["sox", "-d", "-c", "1", "-r", "16000", "{output}"], + ["rec", "-c", "1", "-r", "16000", "{output}"], + ["arecord", "-f", "S16_LE", "-c", "1", "-r", "16000", "{output}"], +] + +const defaultMime = "audio/mpeg" + +const resolveType = (voice?: Config.Info["voice"]) => { + if (voice?.type) return voice.type + if (voice?.whisper?.apiKey && !voice?.alm?.apiKey) return "whisper" + if (voice?.alm?.apiKey && !voice?.whisper?.apiKey) return "alm" + if (voice?.whisper?.apiKey) return "whisper" + if (voice?.alm?.apiKey) return "alm" + return "whisper" +} + +const pickCommand = (config?: VoiceConfig) => { + if (config?.command?.length) return config.command + for (const candidate of defaultCommands) { + const bin = candidate[0] + if (!bin) continue + if (Bun.which(bin)) return candidate + } + return undefined +} + +const readStream = async (stream?: ReadableStream | number | null) => { + if (!stream || typeof stream === "number") return "" + return new Response(stream).text().catch(() => "") +} + +export namespace Voice { + export function create(input: { + config: () => VoiceConfig | undefined + transcription?: () => Config.Info["voice"] | undefined + sessionID?: () => string | undefined + prompt?: () => string | undefined + }) { + const state = { + proc: undefined 
as ReturnType | undefined, + output: undefined as string | undefined, + controller: undefined as AbortController | undefined, + cancelled: false, + } + + const isEnabled = () => { + const voice = input.transcription?.() + const type = resolveType(voice) + if (type === "alm") return !!voice?.alm?.apiKey + return !!voice?.whisper?.apiKey + } + + const start = async () => { + if (state.proc) return false + const config = input.config() + const command = pickCommand(config) + if (!command) return false + state.output = path.join(tmpdir(), `opencode-voice-${crypto.randomUUID()}.mp3`) + const args = command.map((entry) => entry.replaceAll("{output}", state.output!)) + state.proc = Bun.spawn(args, { stdout: "pipe", stderr: "pipe" }) + console.log("voice recorder started", { args, output: state.output }) + await Bun.sleep(100) + return true + } + + const stop = async () => { + if (!state.proc || !state.output) return + const target = state.proc + state.proc = undefined + const pathResult = state.output + state.output = undefined + target.kill() + await target.exited.catch(() => {}) + + const stdout = await readStream(target.stdout) + const stderr = await readStream(target.stderr) + if (stdout || stderr) { + console.log("voice recorder output", { stdout, stderr }) + } + + const mime = input.config()?.mime ?? defaultMime + const buffer = await Bun.file(pathResult).arrayBuffer().catch(() => undefined) + console.log("voice recorder bytes", { bytes: buffer?.byteLength ?? 
0 }) + await Bun.file(pathResult).delete().catch(() => {}) + if (!buffer) return + + const blob = new Blob([buffer], { type: mime }) + const apiFile = new File([blob], "audio.mp3", { type: mime }) + const voice = input.transcription?.() + const type = resolveType(voice) + if (type === "alm") { + console.log("voice transcribe start", { + provider: "alm", + bytes: buffer.byteLength, + url: voice?.alm?.url, + model: voice?.alm?.model, + }) + } + if (type === "whisper") { + console.log("voice transcribe start", { + provider: "whisper", + bytes: buffer.byteLength, + url: voice?.whisper?.url, + model: voice?.whisper?.model, + language: voice?.whisper?.language, + }) + } + state.cancelled = false + state.controller = new AbortController() + const result = await (type === "alm" + ? Alm.transcribe({ + file: apiFile, + mime, + sessionID: input.sessionID?.(), + prompt: input.prompt?.(), + signal: state.controller.signal, + voice, + }) + : Whisper.transcribe({ + file: apiFile, + mime, + sessionID: input.sessionID?.(), + prompt: input.prompt?.(), + signal: state.controller.signal, + voice, + })) + .then((response) => ({ text: response.text, cancelled: false })) + .catch((error) => { + console.log("voice transcribe failed", { error: String(error), provider: type }) + if (error?.name === "AbortError" || state.cancelled) return { text: "", cancelled: true } + throw error + }) + state.controller = undefined + if (!result) return + return result + } + + const cancel = () => { + if (!state.controller) return false + state.cancelled = true + state.controller.abort() + return true + } + + return { + isEnabled, + start, + stop, + cancel, + } + } +} diff --git a/packages/opencode/src/config/config.ts b/packages/opencode/src/config/config.ts index dfb86dbe26f3..d12b72f879d3 100644 --- a/packages/opencode/src/config/config.ts +++ b/packages/opencode/src/config/config.ts @@ -823,6 +823,7 @@ export namespace Config { variant_cycle: z.string().optional().default("ctrl+t").describe("Cycle 
model variants"), input_clear: z.string().optional().default("ctrl+c").describe("Clear input field"), input_paste: z.string().optional().default("ctrl+v").describe("Paste from clipboard"), + input_voice: z.string().optional().default("v").describe("Toggle voice input"), input_submit: z.string().optional().default("return").describe("Submit input"), input_newline: z .string() @@ -929,8 +930,43 @@ export namespace Config { .enum(["auto", "stacked"]) .optional() .describe("Control diff rendering style: 'auto' adapts to terminal width, 'stacked' always shows single column"), + voice: z + .object({ + command: z + .array(z.string()) + .optional() + .describe("Recorder command template with {output} placeholder"), + mime: z.string().optional().describe("Recorded audio mime type"), + }) + .optional() + .describe("Voice input settings"), }) + export const Voice = z + .object({ + type: z.enum(["whisper", "alm"]).optional().describe("Transcription provider type"), + whisper: z + .object({ + url: z.string().optional().describe("Whisper API URL"), + apiKey: z.string().optional().describe("Whisper API key"), + model: z.string().optional().describe("Whisper model name"), + language: z.string().optional().describe("Whisper language code"), + }) + .optional() + .describe("Whisper transcription settings"), + alm: z + .object({ + url: z.string().optional().describe("Audio LM API URL"), + apiKey: z.string().optional().describe("Audio LM API key"), + model: z.string().optional().describe("Audio LM model name"), + prompt: z.string().optional().describe("Audio LM base prompt"), + system: z.string().optional().describe("Audio LM system prompt"), + }) + .optional() + .describe("Audio language model transcription settings"), + }) + .describe("Voice transcription settings") + export const Server = z .object({ port: z.number().int().positive().optional().describe("Port to listen on"), @@ -1009,6 +1045,7 @@ export namespace Config { keybinds: Keybinds.optional().describe("Custom keybind 
configurations"), logLevel: Log.Level.optional().describe("Log level"), tui: TUI.optional().describe("TUI specific settings"), + voice: Voice.optional().describe("Voice transcription settings"), server: Server.optional().describe("Server configuration for opencode serve and web commands"), command: z .record(z.string(), Command) diff --git a/packages/opencode/src/flag/flag.ts b/packages/opencode/src/flag/flag.ts index b11058b34058..55d94390f1f3 100644 --- a/packages/opencode/src/flag/flag.ts +++ b/packages/opencode/src/flag/flag.ts @@ -30,6 +30,7 @@ export namespace Flag { export declare const OPENCODE_CLIENT: string export const OPENCODE_SERVER_PASSWORD = process.env["OPENCODE_SERVER_PASSWORD"] export const OPENCODE_SERVER_USERNAME = process.env["OPENCODE_SERVER_USERNAME"] + export const OPENCODE_WEB_URL = process.env["OPENCODE_WEB_URL"] // Experimental export const OPENCODE_EXPERIMENTAL = truthy("OPENCODE_EXPERIMENTAL") diff --git a/packages/opencode/src/server/routes/voice.ts b/packages/opencode/src/server/routes/voice.ts new file mode 100644 index 000000000000..548316e49062 --- /dev/null +++ b/packages/opencode/src/server/routes/voice.ts @@ -0,0 +1,69 @@ +import { describeRoute, resolver } from "hono-openapi" +import { zValidator } from "@hono/zod-validator" +import z from "zod" +import { Config } from "@/config/config" +import { Alm } from "@/voice/alm" +import { Whisper } from "@/voice/whisper" +import { lazy } from "@/util/lazy" +import { Hono } from "hono" + +const resolveType = (voice?: Config.Info["voice"]) => { + if (voice?.type) return voice.type + if (voice?.whisper?.apiKey && !voice?.alm?.apiKey) return "whisper" + if (voice?.alm?.apiKey && !voice?.whisper?.apiKey) return "alm" + if (voice?.whisper?.apiKey) return "whisper" + if (voice?.alm?.apiKey) return "alm" + return "whisper" +} + +export const VoiceRoutes = lazy(() => + new Hono().post( + "/transcribe", + describeRoute({ + summary: "Transcribe audio", + description: "Transcribe an audio file 
with Whisper or an audio language model", + operationId: "audio.transcribe", + responses: { + 200: { + description: "Transcription result", + content: { + "application/json": { + schema: resolver(Whisper.Response), + }, + }, + }, + }, + }), + zValidator( + "form", + z.object({ + file: z.instanceof(File), + sessionID: z.string().optional(), + prompt: z.string().optional(), + }), + ), + async (c) => { + const data = c.req.valid("form") + const file = data.file + const mime = file.type || "audio/wav" + const voice = (await Config.get()).voice + const type = resolveType(voice) + const result = await (type === "alm" + ? Alm.transcribe({ + file, + mime, + sessionID: data.sessionID, + prompt: data.prompt, + voice, + }) + : Whisper.transcribe({ + file, + mime, + sessionID: data.sessionID, + prompt: data.prompt, + voice, + })) + return c.json(result) + }, + ), +) diff --git a/packages/opencode/src/server/server.ts b/packages/opencode/src/server/server.ts index 015553802a47..0875f808f4ce 100644 --- a/packages/opencode/src/server/server.ts +++ b/packages/opencode/src/server/server.ts @@ -40,6 +40,7 @@ import { QuestionRoutes } from "./routes/question" import { PermissionRoutes } from "./routes/permission" import { GlobalRoutes } from "./routes/global" import { MDNS } from "./mdns" +import { VoiceRoutes } from "./routes/voice" // @ts-ignore This global is needed to prevent ai-sdk from logging warnings to stdout https://github.com/vercel/ai/blob/2dc67e0ef538307f21368db32d5a12345d98831b/packages/ai/src/logger/log-warnings.ts#L85 globalThis.AI_SDK_LOG_WARNINGS = false @@ -224,6 +225,7 @@ export namespace Server { .route("/permission", PermissionRoutes()) .route("/question", QuestionRoutes()) .route("/provider", ProviderRoutes()) + .route("/voice", VoiceRoutes()) .route("/", FileRoutes()) .route("/mcp", McpRoutes()) .route("/tui", TuiRoutes()) @@ -532,12 +534,13 @@ export namespace Server { ) .all("/*", async (c) => { const path = c.req.path - - const response = await 
proxy(`https://app.opencode.ai${path}`, { + // Fork override: default web URL points to personal CloudFront; upstream default was https://app.opencode.ai + const target = Flag.OPENCODE_WEB_URL ?? "https://d3ir6x3lfy3u68.cloudfront.net" + const response = await proxy(`${target}${path}`, { ...c.req, headers: { ...c.req.raw.headers, - host: "app.opencode.ai", + host: new URL(target).host, }, }) response.headers.set( diff --git a/packages/opencode/src/voice/alm.ts b/packages/opencode/src/voice/alm.ts new file mode 100644 index 000000000000..de720e88c15f --- /dev/null +++ b/packages/opencode/src/voice/alm.ts @@ -0,0 +1,100 @@ +import z from "zod" +import { Config } from "@/config/config" +import { buildPrompt, getLastAssistantText, toWavOrMp3 } from "@/voice/whisper" + +const buildMessages = (input: { + system?: string + context?: string + audio: string +}) => { + const system = (input.system ?? "You are a professional speech-to-text transcriber. Your task is to transcribe the audio into text.").trim() + const context = input.context?.trim() + const text = context + ? `${system}\n\n${context}\n\nDO NOT answer user's question, just transcribe the audio into text.` + : system + return [ + { + role: "system" as const, + content: text, + }, + { + role: "user" as const, + content: [ + { type: "audio_url", audio_url: { url: input.audio } }, + { type: "text", text: "you are a professional speech to text transcriber, your task is to transcribe the audio into text." }, + ], + }, + ] +} + +export namespace Alm { + export const Request = z.object({ + file: z.instanceof(File), + mime: z.string(), + sessionID: z.string().optional(), + prompt: z.string().optional(), + }) + + export const Response = z.object({ + text: z.string().default(""), + }) + + export type Response = z.infer + + export async function transcribe( + input: z.infer & { signal?: AbortSignal; voice?: Config.Info["voice"] }, + ) { + const voice = input.voice ?? 
(await Config.get()).voice + const alm = voice?.alm + const apiKey = alm?.apiKey + if (!apiKey) { + throw new Error("Missing voice.alm.apiKey") + } + + const content = await input.file.arrayBuffer() + const prepared = await toWavOrMp3({ buffer: content, mime: input.mime }) + const audio = `data:${prepared.mime};base64,${Buffer.from(prepared.buffer).toString("base64")}` + + const assistant = await getLastAssistantText(input.sessionID) + const context = buildPrompt({ assistant, prompt: buildPrompt({ assistant: alm?.prompt, prompt: input.prompt }) }) + const messages = buildMessages({ + system: alm?.system, + context, + audio, + }) + + const payload = { + model: alm?.model ?? "gpt-4o-mini-transcribe", + messages, + temperature: 0, + } + + const url = alm?.url ?? "https://api.openai.com/v1/chat/completions" + console.log("alm request", { + url, + model: payload.model, + bytes: prepared.buffer.byteLength, + }) + + const result = await fetch(url, { + method: "POST", + headers: { + "Content-Type": "application/json", + Authorization: `Bearer ${apiKey}`, + }, + body: JSON.stringify(payload), + signal: input.signal, + }) + + if (!result.ok) { + const message = await result.text().catch(() => "") + throw new Error(message || "ALM request failed") + } + + const body = await result.text().catch(() => "") + console.log("alm response", { body }) + const parsed = body ? JSON.parse(body) : {} + const text = parsed?.choices?.[0]?.message?.content + return Response.parse({ text: typeof text === "string" ? 
text : "" }) + } +} diff --git a/packages/opencode/src/voice/whisper.ts b/packages/opencode/src/voice/whisper.ts new file mode 100644 index 000000000000..b9a7fa77ac8b --- /dev/null +++ b/packages/opencode/src/voice/whisper.ts @@ -0,0 +1,153 @@ +import { Config } from "@/config/config" +import { Session } from "@/session" +import { tmpdir } from "os" +import path from "path" +import z from "zod" + +export const toWavOrMp3 = async (input: { buffer: ArrayBuffer; mime: string }) => { + const isWav = input.mime.includes("wav") + const isMp3 = input.mime.includes("mpeg") || input.mime.includes("mp3") + if (isWav || isMp3) { + const name = isWav ? "audio.wav" : "audio.mp3" + const mime = isWav ? "audio/wav" : "audio/mpeg" + return { buffer: input.buffer, name, mime } + } + + const outPath = path.join(tmpdir(), `opencode-voice-${crypto.randomUUID()}.mp3`) + const proc = Bun.spawn( + [ + "ffmpeg", + "-y", + "-f", + "webm", + "-i", + "pipe:0", + "-ac", + "1", + "-ar", + "16000", + "-f", + "mp3", + outPath, + ], + { + stdin: "pipe", + stdout: "ignore", + stderr: "ignore", + }, + ) + proc.stdin?.write(new Uint8Array(input.buffer)) + proc.stdin?.end() + await proc.exited + + const file = Bun.file(outPath, { type: "audio/mpeg" }) + const buffer = await file.arrayBuffer().catch(() => undefined) + await Bun.file(outPath).delete().catch(() => {}) + if (!buffer) throw new Error("Failed to convert audio") + return { buffer, name: "audio.mp3", mime: "audio/mpeg" } +} + +export const getLastAssistantText = async (sessionID?: string) => { + if (!sessionID) return "" + return Promise.resolve() + .then(() => Session.messages({ sessionID, limit: 50 })) + .then((messages) => { + for (let i = messages.length - 1; i >= 0; i -= 1) { + const msg = messages[i] + if (msg.info.role !== "assistant") continue + const text = msg.parts + .filter((part) => part.type === "text") + .map((part) => part.text) + .join(" ") + .trim() + if (text) return text + } + return "" + }) + .catch((error) => { + 
console.log("whisper session lookup failed", { error: String(error) }) + return "" + }) +} + +export const buildPrompt = (input: { prompt?: string; assistant?: string }) => { + const head = input.assistant?.trim() ?? "" + const tail = input.prompt?.trim() ?? "" + if (!head) return tail + if (!tail) return head + return `${head} ${tail}` +} + +export namespace Whisper { + export const Request = z.object({ + file: z.instanceof(File), + mime: z.string(), + sessionID: Session.Info.shape.id.optional(), + prompt: z.string().optional(), + }) + + export const Response = z.object({ + text: z.string().default(""), + }) + + export type Response = z.infer + + export async function transcribe( + input: z.infer & { signal?: AbortSignal; voice?: Config.Info["voice"] }, + ) { + const voice = input.voice ?? (await Config.get()).voice + const whisper = voice?.whisper + const apiKey = whisper?.apiKey + if (!apiKey) { + throw new Error("Missing voice.whisper.apiKey") + } + + const content = await input.file.arrayBuffer() + const prepared = await toWavOrMp3({ + buffer: content, + mime: input.mime, + }) + + const assistant = await getLastAssistantText(input.sessionID) + const prompt = buildPrompt({ assistant, prompt: input.prompt }) + + const form = new FormData() + form.append("file", new Blob([prepared.buffer], { type: prepared.mime }), prepared.name) + form.append("model", whisper?.model ?? "whisper-1") + form.append("response_format", "json") + if (whisper?.language) { + form.append("language", whisper.language) + } + if (prompt) { + form.append("prompt", prompt) + } + + const url = whisper?.url ?? "https://api.openai.com/v1/audio/transcriptions" + console.log("whisper request", { + url, + model: whisper?.model ?? 
"whisper-1", + language: whisper?.language, + bytes: prepared.buffer.byteLength, + }) + const result = await fetch(url, { + method: "POST", + headers: { + Authorization: `Bearer ${apiKey}`, + }, + body: form, + signal: input.signal, + }) + + if (!result.ok) { + const message = await result.text().catch(() => "") + throw new Error(message || "Whisper request failed") + } + + const contentType = result.headers.get("content-type") ?? "" + const body = await result.text().catch(() => "") + console.log("whisper response", { contentType, body }) + const payload = body ? JSON.parse(body) : { text: "" } + const text = typeof payload?.text === "string" ? payload.text : "" + return Response.parse({ text }) + } +} diff --git a/packages/sdk/js/src/gen/types.gen.ts b/packages/sdk/js/src/gen/types.gen.ts index 8eefe5bfe985..43ad006237d3 100644 --- a/packages/sdk/js/src/gen/types.gen.ts +++ b/packages/sdk/js/src/gen/types.gen.ts @@ -938,6 +938,10 @@ export type KeybindsConfig = { * Paste from clipboard */ input_paste?: string + /** + * Toggle voice input + */ + input_voice?: string /** * Submit input */ @@ -1207,6 +1211,45 @@ export type Config = { * Control diff rendering style: 'auto' adapts to terminal width, 'stacked' always shows single column */ diff_style?: "auto" | "stacked" + /** + * Voice input settings + */ + voice?: { + /** + * Recorder command template with {output} placeholder + */ + command?: Array + /** + * Recorded audio mime type + */ + mime?: string + } + } + /** + * Voice transcription settings + */ + voice?: { + /** + * Whisper transcription settings + */ + whisper?: { + /** + * Whisper API URL + */ + url?: string + /** + * Whisper API key + */ + apiKey?: string + /** + * Whisper model name + */ + model?: string + /** + * Whisper language code + */ + language?: string + } } /** * Command configuration, see https://opencode.ai/docs/commands diff --git a/packages/sdk/js/src/v2/gen/sdk.gen.ts b/packages/sdk/js/src/v2/gen/sdk.gen.ts index 
b757b7535075..cfb40034fd3d 100644 --- a/packages/sdk/js/src/v2/gen/sdk.gen.ts +++ b/packages/sdk/js/src/v2/gen/sdk.gen.ts @@ -8,6 +8,7 @@ import type { AppLogErrors, AppLogResponses, AppSkillsResponses, + AudioTranscribeResponses, Auth as Auth3, AuthRemoveErrors, AuthRemoveResponses, @@ -2161,6 +2162,27 @@ export class Provider extends HeyApiClient { } } +export class Audio extends HeyApiClient { + /** + * Transcribe audio + * + * Transcribe an audio file with Whisper + */ + public transcribe( + parameters?: { + directory?: string + }, + options?: Options, + ) { + const params = buildClientParams([parameters], [{ args: [{ in: "query", key: "directory" }] }]) + return (options?.client ?? this.client).post({ + url: "/voice/transcribe", + ...options, + ...params, + }) + } +} + export class Find extends HeyApiClient { /** * Find text @@ -3251,6 +3273,11 @@ export class OpencodeClient extends HeyApiClient { return (this._provider ??= new Provider({ client: this.client })) } + private _audio?: Audio + get audio(): Audio { + return (this._audio ??= new Audio({ client: this.client })) + } + private _find?: Find get find(): Find { return (this._find ??= new Find({ client: this.client })) diff --git a/packages/sdk/js/src/v2/gen/types.gen.ts b/packages/sdk/js/src/v2/gen/types.gen.ts index cb1606e3f610..89f6c8142ec7 100644 --- a/packages/sdk/js/src/v2/gen/types.gen.ts +++ b/packages/sdk/js/src/v2/gen/types.gen.ts @@ -1137,6 +1137,10 @@ export type KeybindsConfig = { * Paste from clipboard */ input_paste?: string + /** + * Toggle voice input + */ + input_voice?: string /** * Submit input */ @@ -1640,6 +1644,45 @@ export type Config = { * Control diff rendering style: 'auto' adapts to terminal width, 'stacked' always shows single column */ diff_style?: "auto" | "stacked" + /** + * Voice input settings + */ + voice?: { + /** + * Recorder command template with {output} placeholder + */ + command?: Array + /** + * Recorded audio mime type + */ + mime?: string + } + } + /** + * 
Voice transcription settings + */ + voice?: { + /** + * Whisper transcription settings + */ + whisper?: { + /** + * Whisper API URL + */ + url?: string + /** + * Whisper API key + */ + apiKey?: string + /** + * Whisper model name + */ + model?: string + /** + * Whisper language code + */ + language?: string + } } server?: ServerConfig /** @@ -4123,6 +4166,26 @@ export type ProviderOauthCallbackResponses = { export type ProviderOauthCallbackResponse = ProviderOauthCallbackResponses[keyof ProviderOauthCallbackResponses] +export type AudioTranscribeData = { + body?: never + path?: never + query?: { + directory?: string + } + url: "/voice/transcribe" +} + +export type AudioTranscribeResponses = { + /** + * Transcription result + */ + 200: { + text?: string + } +} + +export type AudioTranscribeResponse = AudioTranscribeResponses[keyof AudioTranscribeResponses] + export type FindTextData = { body?: never path?: never diff --git a/packages/ui/src/components/icon-button.tsx b/packages/ui/src/components/icon-button.tsx index f1832ce7ffdf..a30e722fd633 100644 --- a/packages/ui/src/components/icon-button.tsx +++ b/packages/ui/src/components/icon-button.tsx @@ -1,5 +1,5 @@ import { Button as Kobalte } from "@kobalte/core/button" -import { type ComponentProps, splitProps } from "solid-js" +import { type ComponentProps, children, splitProps } from "solid-js" import { Icon, IconProps } from "./icon" export interface IconButtonProps extends ComponentProps { @@ -10,7 +10,8 @@ export interface IconButtonProps extends ComponentProps { } export function IconButton(props: ComponentProps<"button"> & IconButtonProps) { - const [split, rest] = splitProps(props, ["variant", "size", "iconSize", "class", "classList"]) + const [split, rest] = splitProps(props, ["variant", "size", "iconSize", "class", "classList", "children"]) + const content = children(() => split.children) return ( & IconButtonProps) { [split.class ?? ""]: !!split.class, }} > - + {content() ?? 
} ) } diff --git a/packages/ui/src/components/icon.tsx b/packages/ui/src/components/icon.tsx index 544c6abdd214..8e16c8214c5e 100644 --- a/packages/ui/src/components/icon.tsx +++ b/packages/ui/src/components/icon.tsx @@ -61,6 +61,7 @@ const icons = { share: ``, download: ``, menu: ``, + mic: ``, server: ``, branch: ``, edit: ``, diff --git a/packages/web/src/content/docs/config.mdx b/packages/web/src/content/docs/config.mdx index 5cc9d8666a96..4228fffe39a3 100644 --- a/packages/web/src/content/docs/config.mdx +++ b/packages/web/src/content/docs/config.mdx @@ -179,6 +179,56 @@ Available options: --- +### Voice + +Configure voice transcription for Whisper or ALM with the `voice` option. + +```json title="opencode.json" +{ + "$schema": "https://opencode.ai/config.json", + "voice": { + "type": "whisper", + "whisper": { + "url": "http://127.0.0.1:5000/v1/audio/transcriptions", + "apiKey": "{env:OPENCODE_WHISPER_API_KEY}", + "model": "whisper-1", + "language": "en" + } + } +} +``` + +```json title="opencode.json" +{ + "$schema": "https://opencode.ai/config.json", + "voice": { + "type": "alm", + "alm": { + "url": "https://api.openai.com/v1/chat/completions", + "apiKey": "{env:OPENCODE_ALM_API_KEY}", + "model": "gpt-4o-mini-transcribe", + "system": "You are a professional speech-to-text transcriber. Your task is to transcribe the audio into text.", + "prompt": "Keep technical terms unchanged." + } + } +} +``` + +Available options: + +- `type` - Transcription provider (`whisper` or `alm`). +- `whisper.url` - Whisper transcription endpoint URL. +- `whisper.apiKey` - API key for the Whisper service. +- `whisper.model` - Whisper model name (default: `whisper-1`). +- `whisper.language` - Optional language hint (e.g. `en`). +- `alm.url` - Audio LM transcription endpoint URL. +- `alm.apiKey` - API key for the ALM service. +- `alm.model` - Audio LM model name. +- `alm.prompt` - Optional base prompt for transcription. +- `alm.system` - Optional system prompt for transcription. 
+ +--- + ### Server You can configure server settings for the `opencode serve` and `opencode web` commands through the `server` option.