From f5a8eb69fc7ea386638c31b5cbf430b7e4099a86 Mon Sep 17 00:00:00 2001
From: heimoshuiyu <heimoshuiyu@gmail.com>
Date: Thu, 5 Feb 2026 15:44:15 +0800
Subject: [PATCH 1/7] feat: add voice input feature

move whisper config into config

document whisper voice config

remove tui voice enabled

Fix voice error handling and whisper context
---
 packages/app/src/components/prompt-input.tsx  | 224 ++++++++++++++++++
 .../cli/cmd/tui/component/prompt/index.tsx    | 128 ++++++++++
 .../opencode/src/cli/cmd/tui/util/voice.ts    | 132 +++++++++++
 packages/opencode/src/config/config.ts        |  26 ++
 packages/opencode/src/server/routes/voice.ts  |  46 ++++
 packages/opencode/src/server/server.ts        |   2 +
 packages/opencode/src/voice/whisper.ts        | 153 ++++++++++++
 packages/sdk/js/src/gen/types.gen.ts          |  43 ++++
 packages/sdk/js/src/v2/gen/sdk.gen.ts         |  27 +++
 packages/sdk/js/src/v2/gen/types.gen.ts       |  63 +++++
 packages/ui/src/components/icon.tsx           |   1 +
 packages/web/src/content/docs/config.mdx      |  27 +++
 12 files changed, 872 insertions(+)
 create mode 100644 packages/opencode/src/cli/cmd/tui/util/voice.ts
 create mode 100644 packages/opencode/src/server/routes/voice.ts
 create mode 100644 packages/opencode/src/voice/whisper.ts
diff --git a/packages/app/src/components/prompt-input.tsx b/packages/app/src/components/prompt-input.tsx
index b897e394aa18..198460c9323a 100644
--- a/packages/app/src/components/prompt-input.tsx
+++ b/packages/app/src/components/prompt-input.tsx
@@ -251,6 +251,16 @@ export const PromptInput: Component<PromptInputProps> = (props) => {
     applyingHistory: false,
   })
 
+  const [recording, setRecording] = createSignal(false) // true while a MediaRecorder is capturing
+  const [transcribing, setTranscribing] = createSignal(false) // true while a transcription request is in flight
+  const audio = { // non-reactive recording state shared by the voice helpers
+    recorder: undefined as MediaRecorder | undefined, // active recorder, undefined when idle
+    stream: undefined as MediaStream | undefined, // microphone stream, released by stopStream()
+    controller: undefined as AbortController | undefined, // aborts the in-flight transcription request
+    chunks: [] as Blob[], // data pushed by recorder.ondataavailable
+    mime: "", // mime type used when the final Blob is built
+  }
+
   const MAX_HISTORY = 100
   const [history, setHistory] = persisted(
     Persist.global("prompt-history", ["prompt-history.v1"]),
@@ -384,6 +394,204 @@ export const PromptInput: Component<PromptInputProps> = (props) => {
     addPart({ type: "text", content: plainText, start: 0, end: 0 })
   }
 
+  const isVoiceSupported = () => // true when both getUserMedia and MediaRecorder exist in this environment
+    typeof navigator !== "undefined" &&
+    typeof window !== "undefined" &&
+    Boolean(navigator.mediaDevices?.getUserMedia) &&
+    typeof MediaRecorder !== "undefined"
+
+  const stopStream = () => { // stops every track of the captured stream and forgets it
+    audio.stream?.getTracks().forEach((track) => track.stop())
+    audio.stream = undefined
+  }
+
+  // Requests the microphone and begins capturing into audio.chunks.
+  // Resolves true only once a recorder is running; every failure after the stream
+  // was acquired releases the microphone again before resolving false.
+  const recordStart = async () => {
+    if (!isVoiceSupported()) {
+      showToast({
+        title: "Voice input unavailable",
+        description: "Your browser does not support audio recording.",
+      })
+      return false
+    }
+    if (audio.recorder) return false
+
+    const stream = await navigator.mediaDevices.getUserMedia({ audio: true }).catch(() => undefined)
+    if (!stream) {
+      showToast({
+        title: "Microphone blocked",
+        description: "Allow microphone access to start recording.",
+      })
+      return false
+    }
+
+    // ensure we can clean up stream even if mime unsupported
+    audio.stream = stream
+
+    const unavailable = () => {
+      stopStream()
+      showToast({
+        title: "Voice input unavailable",
+        description: "This browser does not support the available audio formats.",
+      })
+      return false
+    }
+    const mime = ["audio/webm;codecs=opus", "audio/webm"].find((type) => MediaRecorder.isTypeSupported(type))
+    if (!mime) return unavailable()
+
+    try {
+      // The constructor and start() can both throw; audio.recorder is published only
+      // after start() succeeded so a failure cannot leave a dead recorder behind.
+      const recorder = new MediaRecorder(stream, { mimeType: mime })
+      recorder.ondataavailable = (event) => {
+        if (event.data.size === 0) return
+        audio.chunks.push(event.data)
+      }
+      audio.mime = recorder.mimeType || mime
+      audio.chunks = []
+      recorder.start()
+      audio.recorder = recorder
+    } catch {
+      return unavailable()
+    }
+    setRecording(true)
+    return true
+  }
+
+  // Stops the active recorder and resolves with the captured audio (undefined when idle).
+  const recordStop = async () => {
+    const recorder = audio.recorder
+    if (!recorder) return
+    audio.recorder = undefined
+
+    // A recorder that already went inactive (e.g. the mic track ended) never fires
+    // another "stop" event, so only wait for it while the recorder is still running.
+    if (recorder.state !== "inactive") {
+      const stopped = new Promise<void>((resolve) => (recorder.onstop = () => resolve()))
+      recorder.stop()
+      await stopped
+    }
+    stopStream()
+    setRecording(false)
+    return new Blob(audio.chunks, { type: audio.mime || "audio/webm" })
+  }
+
+  const transcribeAudio = async (blob: Blob) => { // uploads the recording and hands the returned text to addPart
+    if (!blob.size) {
+      showToast({
+        title: "No audio captured",
+        description: "Try recording again.",
+      })
+      return
+    }
+
+    const mime = blob.type || "audio/webm"
+    const filename = mime.includes("webm") ? "audio.webm" : "audio.dat"
+    const file = new File([blob], filename, { type: mime })
+    const form = new FormData()
+    const currentPrompt = prompt.current()
+    const promptText = currentPrompt.map((part) => ("content" in part ? part.content : "")).join("") // text already typed, sent as context
+    form.append("file", file)
+    if (params.id) {
+      form.append("sessionID", params.id)
+    }
+    if (promptText.trim()) {
+      form.append("prompt", promptText)
+    }
+
+    const fetcher = platform.fetch ?? fetch
+    const controller = new AbortController()
+    audio.controller = controller // lets toggleVoice and the cleanup handler abort this request
+    setTranscribing(true)
+    const response = await fetcher(`${sdk.url}/voice/transcribe`, {
+      method: "POST",
+      body: form,
+      signal: controller.signal,
+    }).catch(() => undefined) // network errors and aborts both surface as undefined
+
+    audio.controller = undefined
+
+    if (!response) {
+      setTranscribing(false)
+      if (controller.signal.aborted) return // aborted on purpose: no error toast
+      showToast({
+        title: "Transcription failed",
+        description: "Failed to reach the server.",
+      })
+      return
+    }
+
+    const payload = await response.json().catch(() => ({ text: "" })) // an unparsable body is treated as empty text
+    const text = typeof payload?.text === "string" ? payload.text : ""
+    setTranscribing(false)
+
+    if (!response.ok) {
+      if (controller.signal.aborted) return
+      showToast({
+        title: "Transcription failed",
+        description: text || "Request failed.",
+      })
+      return
+    }
+
+    if (!text.trim()) {
+      showToast({
+        title: "No speech detected",
+        description: "Try speaking closer to the microphone.",
+      })
+      return
+    }
+
+    addPart({ type: "text", content: text, start: 0, end: 0 })
+    requestAnimationFrame(() => { // refocus the editor on the next frame
+      editorRef.focus()
+      queueScroll()
+    })
+  }
+
+  const toggleVoice = async () => { // one step: transcribing -> cancel, recording -> stop + transcribe, idle -> record
+    if (transcribing()) {
+      const controller = audio.controller
+      if (controller) {
+        controller.abort() // transcribeAudio sees the aborted signal and skips its error toast
+        setTranscribing(false)
+        showToast({
+          title: "Transcription cancelled",
+          description: "Stopped the current transcription.",
+        })
+      }
+      return
+    }
+
+    if (recording()) {
+      const blob = await recordStop()
+      if (!blob) return // recordStop resolves undefined when no recorder was active
+      await transcribeAudio(blob)
+      return
+    }
+
+    await recordStart()
+  }
+
+  const voiceTitle = createMemo(() => // tooltip text for the voice button; transcribing takes precedence over recording
+    transcribing() ? "Cancel transcription" : recording() ? "Stop recording" : "Voice input",
+  )
+
+  command.register(() => [ // registers the voice toggle as a command with a keybind
+    {
+      id: "prompt.voice",
+      title: "Voice input",
+      description: "Start or stop voice recording",
+      category: "Prompt",
+      keybind: "mod+shift+m",
+      onSelect: () => {
+        void toggleVoice() // fire-and-forget; toggleVoice reports its own failures via toasts
+      },
+    },
+  ])
+
   const handleGlobalDragOver = (event: DragEvent) => {
     if (dialog.active) return
 
@@ -428,6 +636,13 @@ export const PromptInput: Component<PromptInputProps> = (props) => {
     document.removeEventListener("dragover", handleGlobalDragOver)
     document.removeEventListener("dragleave", handleGlobalDragLeave)
     document.removeEventListener("drop", handleGlobalDrop)
+    if (transcribing()) {
+      const controller = audio.controller
+      if (controller) controller.abort()
+      setTranscribing(false)
+    }
+    if (!recording()) return
+    void recordStop()
   })
 
   createEffect(() => {
@@ -2049,6 +2264,15 @@ export const PromptInput: Component<PromptInputProps> = (props) => {
                 </Tooltip>
               </Show>
             </div>
+            <TooltipKeybind placement="top" title={voiceTitle()} keybind={command.keybind("prompt.voice")}>
+              <IconButton
+                type="button"
+                icon={transcribing() || recording() ? "stop" : "mic"}
+                variant="ghost"
+                class="h-6 w-6"
+                onClick={toggleVoice}
+              />
+            </TooltipKeybind>
             <Tooltip
               placement="top"
               inactive={!prompt.dirty() && !working()}
diff --git a/packages/opencode/src/cli/cmd/tui/component/prompt/index.tsx b/packages/opencode/src/cli/cmd/tui/component/prompt/index.tsx
index 8576dd5763ab..c5463241e650 100644
--- a/packages/opencode/src/cli/cmd/tui/component/prompt/index.tsx
+++ b/packages/opencode/src/cli/cmd/tui/component/prompt/index.tsx
@@ -32,6 +32,7 @@ import { useToast } from "../../ui/toast"
 import { useKV } from "../../context/kv"
 import { useTextareaKeybindings } from "../textarea-keybindings"
 import { DialogSkill } from "../dialog-skill"
+import { Voice } from "../../util/voice"
 
 export type PromptProps = {
   sessionID?: string
@@ -87,6 +88,13 @@ export function Prompt(props: PromptProps) {
   }
 
   const textareaKeybindings = useTextareaKeybindings()
+  const voiceConfig = createMemo(() => sync.data.config.tui?.voice)
+  const voice = Voice.create({
+    config: voiceConfig,
+    transcription: () => sync.data.config.voice,
+    sessionID: () => props.sessionID,
+    prompt: () => store.prompt.input,
+  })
 
   const fileStyleId = syntax().getStyleId("extmark.file")!
   const agentStyleId = syntax().getStyleId("extmark.agent")!
@@ -123,6 +131,8 @@ export function Prompt(props: PromptProps) {
     extmarkToPartIndex: Map<number, number>
     interrupt: number
     placeholder: number
+    recording: boolean
+    processing: boolean
   }>({
     placeholder: Math.floor(Math.random() * PLACEHOLDERS.length),
     prompt: {
@@ -132,6 +142,8 @@ export function Prompt(props: PromptProps) {
     mode: "normal",
     extmarkToPartIndex: new Map(),
     interrupt: 0,
+    recording: false,
+    processing: false,
   })
 
   // Initialize agent/model/variant from last user message when session changes
@@ -180,6 +192,16 @@ export function Prompt(props: PromptProps) {
           dialog.clear()
         },
       },
+      {
+        title: "Voice input",
+        value: "prompt.voice",
+        disabled: true,
+        keybind: "input_voice",
+        category: "Prompt",
+        onSelect: async () => {
+          await toggleVoice()
+        },
+      },
       {
         title: "Paste",
         value: "prompt.paste",
@@ -680,6 +702,84 @@ export function Prompt(props: PromptProps) {
     )
   }
 
+  /**
+   * Advances the voice-input state machine by one step:
+   * transcribing -> cancel, recording -> stop and transcribe, idle -> start recording.
+   * Each failure produces exactly one toast so a specific error is never replaced by
+   * a generic follow-up message.
+   */
+  async function toggleVoice() {
+    if (store.processing) {
+      // voice.cancel() returns false when there is no request to abort.
+      if (!voice.cancel()) return
+      setStore("processing", false)
+      toast.show({
+        message: "Transcription cancelled",
+        variant: "info",
+        duration: 1500,
+      })
+      return
+    }
+
+    if (store.recording) {
+      setStore("recording", false)
+      setStore("processing", true)
+      let stopFailed = false
+      const result = await voice.stop().catch((error) => {
+        stopFailed = true
+        toast.error(error)
+        return undefined
+      })
+      setStore("processing", false)
+      // A rejection was already reported by toast.error above.
+      if (stopFailed || result?.cancelled) return
+      if (!result) {
+        toast.show({ message: "Recording failed (empty audio)", variant: "warning" })
+        return
+      }
+      if (!result.text.trim()) {
+        toast.show({ message: "No speech detected (Whisper returned empty text)", variant: "warning" })
+        return
+      }
+
+      input.insertText(result.text)
+      input.getLayoutNode().markDirty()
+      input.gotoBufferEnd()
+      renderer.requestRender()
+      return
+    }
+
+    if (!voice.isEnabled()) {
+      // isEnabled() only checks the voice.whisper.apiKey config value.
+      toast.show({ message: "Voice input unavailable (missing voice.whisper.apiKey in config)", variant: "warning" })
+      return
+    }
+
+    setStore("recording", true)
+    toast.show({
+      message: "Recording... press keybind again to stop",
+      variant: "info",
+      duration: 2000,
+    })
+    let startFailed = false
+    const ok = await voice.start().catch((error) => {
+      startFailed = true
+      toast.error(error)
+      return false
+    })
+    if (ok) return
+    setStore("recording", false)
+    // A rejection was already reported by toast.error above.
+    if (startFailed) return
+    toast.show({ message: "Failed to start recording", variant: "error" })
+  }
+
+  onCleanup(() => { // cleanup hook: abort a pending transcription and end a running recording
+    if (store.processing) voice.cancel()
+    if (!store.recording) return
+    voice.stop().catch(() => {}) // stop() kills the recorder process; its result and errors are discarded here
+  })
+
   async function pasteImage(file: { filename?: string; content: string; mime: string }) {
     const currentOffset = input.visualCursor.offset
     const extmarkStart = currentOffset
@@ -736,6 +836,19 @@ export function Prompt(props: PromptProps) {
     return !!current
   })
 
+  const voiceEnabled = createMemo(() => voice.isEnabled()) // memoised voice.isEnabled()
+  const voiceLabel = createMemo(() => { // label text; processing takes precedence over recording
+    if (store.processing) return "Transcribing"
+    if (store.recording) return "Stop"
+    return "Record"
+  })
+  const voiceColor = createMemo(() => { // warning colour while busy, muted when voice input is not enabled
+    if (store.processing) return theme.warning
+    if (store.recording) return theme.warning
+    if (!voiceEnabled()) return theme.textMuted
+    return theme.text
+  })
+
   const spinnerDef = createMemo(() => {
     const color = local.agent.color(local.agent.current().name)
     return {
@@ -831,6 +944,11 @@ export function Prompt(props: PromptProps) {
                   }
                   // If no image, let the default paste behavior continue
                 }
+                if (keybind.match("input_voice", e)) {
+                  e.preventDefault()
+                  await toggleVoice()
+                  return
+                }
                 if (keybind.match("input_clear", e) && store.prompt.input !== "") {
                   input.clear()
                   input.extmarks.clear()
@@ -991,6 +1109,16 @@ export function Prompt(props: PromptProps) {
                   </Show>
                 </box>
               </Show>
+              <box flexGrow={1} />
+              <box
+                flexDirection="row"
+                onMouseUp={async () => {
+                  if (!voiceEnabled() && !store.recording && !store.processing) return
+                  await toggleVoice()
+                }}
+              >
+                <text fg={voiceColor()}>{voiceLabel()}</text>
+              </box>
             </box>
           </box>
         </box>
diff --git a/packages/opencode/src/cli/cmd/tui/util/voice.ts b/packages/opencode/src/cli/cmd/tui/util/voice.ts
new file mode 100644
index 000000000000..f176c8495e84
--- /dev/null
+++ b/packages/opencode/src/cli/cmd/tui/util/voice.ts
@@ -0,0 +1,132 @@
+import { tmpdir } from "os"
+import path from "path"
+import { Config } from "@/config/config"
+import { Whisper } from "@/voice/whisper"
+
+export type VoiceConfig = {
+  command?: string[]
+  mime?: string
+}
+
+const defaultCommands = [ // recorder argv templates tried in order by pickCommand; "{output}" becomes the temp file path
+  ["ffmpeg", "-y", "-f", "pulse", "-i", "default", "-ac", "1", "-ar", "16000", "-f", "mp3", "{output}"],
+  ["ffmpeg", "-y", "-f", "alsa", "-i", "default", "-ac", "1", "-ar", "16000", "-f", "mp3", "{output}"],
+  ["sox", "-d", "-c", "1", "-r", "16000", "{output}"],
+  ["rec", "-c", "1", "-r", "16000", "{output}"],
+  ["arecord", "-f", "S16_LE", "-c", "1", "-r", "16000", "{output}"],
+]
+
+const defaultMime = "audio/mpeg"
+
+const pickCommand = (config?: VoiceConfig) => { // resolves the recorder argv: configured command first, then defaults
+  if (config?.command?.length) return config.command // a non-empty configured command is used as-is
+  for (const candidate of defaultCommands) {
+    const bin = candidate[0]
+    if (!bin) continue
+    if (Bun.which(bin)) return candidate // first default whose executable Bun.which can find
+  }
+  return undefined // no configured command and no default executable found
+}
+
+const readStream = async (stream?: ReadableStream<Uint8Array> | number | null) => { // reads a stream fully as text
+  if (!stream || typeof stream === "number") return "" // missing or numeric values yield ""
+  return new Response(stream).text().catch(() => "") // read failures also yield ""
+}
+
+export namespace Voice {
+  /**
+   * Builds a recorder/transcriber pair backed by an external capture process.
+   * - `start()` resolves false when already recording, when no recorder command is
+   *   available, or when the recorder exits right after launch.
+   * - `stop()` resolves undefined when nothing usable was recorded and
+   *   `{ cancelled: true }` after `cancel()`; other transcription errors reject.
+   * - `cancel()` aborts an in-flight transcription and reports whether one existed.
+   */
+  export function create(input: {
+    config: () => VoiceConfig | undefined
+    transcription?: () => Config.Info["voice"] | undefined
+    sessionID?: () => string | undefined
+    prompt?: () => string | undefined
+  }) {
+    const state = {
+      proc: undefined as ReturnType<typeof Bun.spawn> | undefined,
+      output: undefined as string | undefined,
+      controller: undefined as AbortController | undefined,
+      cancelled: false,
+    }
+
+    const isEnabled = () => Boolean(input.transcription?.()?.whisper?.apiKey)
+
+    const start = async () => {
+      if (state.proc) return false
+      const command = pickCommand(input.config())
+      if (!command) return false
+      const output = path.join(tmpdir(), `opencode-voice-${crypto.randomUUID()}.mp3`)
+      const args = command.map((entry) => entry.replaceAll("{output}", output))
+      const proc = Bun.spawn(args, { stdout: "pipe", stderr: "pipe" })
+      state.proc = proc
+      state.output = output
+      console.log("voice recorder started", { args, output })
+      await Bun.sleep(100)
+      // Still running, or stop() already took ownership during the grace period.
+      if (proc.exitCode === null || state.proc !== proc) return true
+      // The recorder died at launch: fail now rather than pretend to be recording.
+      state.proc = undefined
+      state.output = undefined
+      console.log("voice recorder exited early", { exitCode: proc.exitCode, stderr: await readStream(proc.stderr) })
+      await Bun.file(output).delete().catch(() => {})
+      return false
+    }
+
+    const stop = async () => {
+      if (!state.proc || !state.output) return
+      const target = state.proc
+      state.proc = undefined
+      const pathResult = state.output
+      state.output = undefined
+      target.kill()
+      await target.exited.catch(() => {})
+
+      const stdout = await readStream(target.stdout)
+      const stderr = await readStream(target.stderr)
+      if (stdout || stderr) console.log("voice recorder output", { stdout, stderr })
+      const mime = input.config()?.mime ?? defaultMime
+      const buffer = await Bun.file(pathResult).arrayBuffer().catch(() => undefined)
+      console.log("voice recorder bytes", { bytes: buffer?.byteLength ?? 0 })
+      await Bun.file(pathResult).delete().catch(() => {})
+      if (!buffer) return
+
+      const apiFile = new File([new Blob([buffer], { type: mime })], "audio.mp3", { type: mime })
+      const whisper = input.transcription?.()?.whisper
+      console.log("whisper transcribe start", { bytes: buffer.byteLength, url: whisper?.url, model: whisper?.model, language: whisper?.language })
+      state.cancelled = false
+      state.controller = new AbortController()
+      return Whisper.transcribe({
+        file: apiFile,
+        mime,
+        sessionID: input.sessionID?.(),
+        prompt: input.prompt?.(),
+        signal: state.controller.signal,
+        voice: input.transcription?.(),
+      })
+        .then((response) => ({ text: response.text, cancelled: false }))
+        .catch((error) => {
+          console.log("whisper transcribe failed", { error: String(error) })
+          if (error?.name === "AbortError" || state.cancelled) return { text: "", cancelled: true }
+          throw error
+        })
+        .finally(() => {
+          state.controller = undefined
+        })
+    }
+
+    const cancel = () => {
+      if (!state.controller) return false
+      state.cancelled = true
+      state.controller.abort()
+      return true
+    }
+
+    return { isEnabled, start, stop, cancel }
+  }
+}
diff --git a/packages/opencode/src/config/config.ts b/packages/opencode/src/config/config.ts
index dfb86dbe26f3..864b69c8eafa 100644
--- a/packages/opencode/src/config/config.ts
+++ b/packages/opencode/src/config/config.ts
@@ -823,6 +823,7 @@ export namespace Config {
       variant_cycle: z.string().optional().default("ctrl+t").describe("Cycle model variants"),
       input_clear: z.string().optional().default("ctrl+c").describe("Clear input field"),
       input_paste: z.string().optional().default("ctrl+v").describe("Paste from clipboard"),
+      input_voice: z.string().optional().default("<leader>v").describe("Toggle voice input"),
       input_submit: z.string().optional().default("return").describe("Submit input"),
       input_newline: z
         .string()
@@ -929,8 +930,32 @@ export namespace Config {
       .enum(["auto", "stacked"])
       .optional()
       .describe("Control diff rendering style: 'auto' adapts to terminal width, 'stacked' always shows single column"),
+    voice: z
+      .object({
+        command: z
+          .array(z.string())
+          .optional()
+          .describe("Recorder command template with {output} placeholder"),
+        mime: z.string().optional().describe("Recorded audio mime type"),
+      })
+      .optional()
+      .describe("Voice input settings"),
   })
 
+  export const Voice = z // schema for voice transcription settings; every whisper field is optional
+    .object({
+      whisper: z
+        .object({
+          url: z.string().optional().describe("Whisper API URL"),
+          apiKey: z.string().optional().describe("Whisper API key"),
+          model: z.string().optional().describe("Whisper model name"),
+          language: z.string().optional().describe("Whisper language code"),
+        })
+        .optional()
+        .describe("Whisper transcription settings"),
+    })
+    .describe("Voice transcription settings")
+
   export const Server = z
     .object({
       port: z.number().int().positive().optional().describe("Port to listen on"),
@@ -1009,6 +1034,7 @@ export namespace Config {
       keybinds: Keybinds.optional().describe("Custom keybind configurations"),
       logLevel: Log.Level.optional().describe("Log level"),
       tui: TUI.optional().describe("TUI specific settings"),
+      voice: Voice.optional().describe("Voice transcription settings"),
       server: Server.optional().describe("Server configuration for opencode serve and web commands"),
       command: z
         .record(z.string(), Command)
diff --git a/packages/opencode/src/server/routes/voice.ts b/packages/opencode/src/server/routes/voice.ts
new file mode 100644
index 000000000000..f044a4d506fe
--- /dev/null
+++ b/packages/opencode/src/server/routes/voice.ts
@@ -0,0 +1,46 @@
+import { describeRoute, resolver } from "hono-openapi"
+import { zValidator } from "@hono/zod-validator"
+import z from "zod"
+import { Whisper } from "@/voice/whisper"
+import { lazy } from "@/util/lazy"
+import { Hono } from "hono"
+
+export const VoiceRoutes = lazy(() => // lazily built Hono app exposing POST /transcribe
+  new Hono().post(
+    "/transcribe",
+    describeRoute({
+      summary: "Transcribe audio",
+      description: "Transcribe an audio file with Whisper",
+      operationId: "audio.transcribe",
+      responses: {
+        200: {
+          description: "Transcription result",
+          content: {
+            "application/json": {
+              schema: resolver(Whisper.Response),
+            },
+          },
+        },
+      },
+    }),
+    zValidator(
+      "form", // validates the form body: a required file plus optional sessionID and prompt
+      z.object({
+        file: z.instanceof(File),
+        sessionID: z.string().optional(),
+        prompt: z.string().optional(),
+      }),
+    ),
+    async (c) => {
+      const data = c.req.valid("form")
+      const file = data.file
+      const result = await Whisper.transcribe({
+        file,
+        mime: file.type || "audio/wav", // uploads without a type are treated as WAV
+        sessionID: data.sessionID,
+        prompt: data.prompt,
+      })
+      return c.json(result)
+    },
+  ),
+)
diff --git a/packages/opencode/src/server/server.ts b/packages/opencode/src/server/server.ts
index 015553802a47..e01aa4276448 100644
--- a/packages/opencode/src/server/server.ts
+++ b/packages/opencode/src/server/server.ts
@@ -40,6 +40,7 @@ import { QuestionRoutes } from "./routes/question"
 import { PermissionRoutes } from "./routes/permission"
 import { GlobalRoutes } from "./routes/global"
 import { MDNS } from "./mdns"
+import { VoiceRoutes } from "./routes/voice"
 
 // @ts-ignore This global is needed to prevent ai-sdk from logging warnings to stdout https://github.com/vercel/ai/blob/2dc67e0ef538307f21368db32d5a12345d98831b/packages/ai/src/logger/log-warnings.ts#L85
 globalThis.AI_SDK_LOG_WARNINGS = false
@@ -224,6 +225,7 @@ export namespace Server {
         .route("/permission", PermissionRoutes())
         .route("/question", QuestionRoutes())
         .route("/provider", ProviderRoutes())
+        .route("/voice", VoiceRoutes())
         .route("/", FileRoutes())
         .route("/mcp", McpRoutes())
         .route("/tui", TuiRoutes())
diff --git a/packages/opencode/src/voice/whisper.ts b/packages/opencode/src/voice/whisper.ts
new file mode 100644
index 000000000000..8a3c1f91f424
--- /dev/null
+++ b/packages/opencode/src/voice/whisper.ts
@@ -0,0 +1,153 @@
+import { Config } from "@/config/config"
+import { Session } from "@/session"
+import { tmpdir } from "os"
+import path from "path"
+import z from "zod"
+
+// Normalises uploaded audio before it is sent to Whisper: WAV and MP3 input passes
+// through untouched, anything else is decoded as WebM and re-encoded by ffmpeg to
+// 16 kHz mono MP3. Throws "Failed to convert audio" when the conversion fails.
+const toWavOrMp3 = async (input: { buffer: ArrayBuffer; mime: string }) => {
+  const isWav = input.mime.includes("wav")
+  const isMp3 = input.mime.includes("mpeg") || input.mime.includes("mp3")
+  if (isWav) return { buffer: input.buffer, name: "audio.wav", mime: "audio/wav" }
+  if (isMp3) return { buffer: input.buffer, name: "audio.mp3", mime: "audio/mpeg" }
+
+  const outPath = path.join(tmpdir(), `opencode-voice-${crypto.randomUUID()}.mp3`)
+  const proc = Bun.spawn(
+    [
+      "ffmpeg",
+      "-y",
+      "-f",
+      "webm",
+      "-i",
+      "pipe:0",
+      "-ac",
+      "1",
+      "-ar",
+      "16000",
+      "-f",
+      "mp3",
+      outPath,
+    ],
+    { stdin: "pipe", stdout: "ignore", stderr: "ignore" },
+  )
+  proc.stdin?.write(new Uint8Array(input.buffer))
+  proc.stdin?.end()
+  const exitCode = await proc.exited
+
+  const file = Bun.file(outPath, { type: "audio/mpeg" })
+  const buffer = await file.arrayBuffer().catch(() => undefined)
+  await Bun.file(outPath).delete().catch(() => {})
+  // A failed ffmpeg run can still leave a truncated or empty file behind; treat a
+  // non-zero exit code or empty output as a conversion failure instead of uploading it.
+  if (exitCode !== 0 || !buffer || buffer.byteLength === 0) {
+    throw new Error("Failed to convert audio")
+  }
+  return { buffer, name: "audio.mp3", mime: "audio/mpeg" }
+}
+
+const getLastAssistantText = async (sessionID?: string) => { // newest non-empty assistant text of the session, or ""
+  if (!sessionID) return ""
+  return Promise.resolve() // wrapping in a promise chain routes synchronous throws into the .catch below
+    .then(() => Session.messages({ sessionID, limit: 50 }))
+    .then((messages) => {
+      for (let i = messages.length - 1; i >= 0; i -= 1) { // walk backwards from the last returned message
+        const msg = messages[i]
+        if (msg.info.role !== "assistant") continue
+        const text = msg.parts
+          .filter((part) => part.type === "text")
+          .map((part) => part.text)
+          .join(" ")
+          .trim()
+        if (text) return text // skip assistant messages without any text
+      }
+      return ""
+    })
+    .catch((error) => { // lookup failures are logged and reported as "no context"
+      console.log("whisper session lookup failed", { error: String(error) })
+      return ""
+    })
+}
+
+const buildPrompt = (input: { prompt?: string; assistant?: string }) => {
+  // Assistant context leads, the caller's prompt trails; blank pieces are dropped.
+  const pieces = [input.assistant, input.prompt]
+    .map((piece) => piece?.trim() ?? "")
+    .filter((piece) => piece.length > 0)
+  return pieces.join(" ")
+}
+
+export namespace Whisper {
+  export const Request = z.object({
+    file: z.instanceof(File),
+    mime: z.string(),
+    sessionID: Session.Info.shape.id.optional(),
+    prompt: z.string().optional(),
+  })
+
+  export const Response = z.object({
+    text: z.string().default(""),
+  })
+
+  export type Response = z.infer<typeof Response>
+
+  /**
+   * Sends an audio file to the configured Whisper-compatible endpoint and returns the
+   * recognised text. The prompt sent along is the newest non-empty assistant text found
+   * in the session's recent messages (if any) followed by `input.prompt`. `input.voice`
+   * takes precedence over the `voice` section of the loaded config.
+   * @throws Error when `voice.whisper.apiKey` is missing, audio conversion fails, the
+   *   endpoint answers with a non-2xx status, or a 2xx body is not valid JSON.
+   */
+  export async function transcribe(
+    input: z.infer<typeof Request> & { signal?: AbortSignal; voice?: Config.Info["voice"] },
+  ) {
+    const voice = input.voice ?? (await Config.get()).voice
+    const whisper = voice?.whisper
+    const apiKey = whisper?.apiKey
+    if (!apiKey) throw new Error("Missing voice.whisper.apiKey")
+
+    const content = await input.file.arrayBuffer()
+    const prepared = await toWavOrMp3({ buffer: content, mime: input.mime })
+
+    const assistant = await getLastAssistantText(input.sessionID)
+    const prompt = buildPrompt({ assistant, prompt: input.prompt })
+
+    const model = whisper?.model ?? "whisper-1"
+    const form = new FormData()
+    form.append("file", new Blob([prepared.buffer], { type: prepared.mime }), prepared.name)
+    form.append("model", model)
+    form.append("response_format", "json")
+    if (whisper?.language) form.append("language", whisper.language)
+    if (prompt) form.append("prompt", prompt)
+
+    const url = whisper?.url ?? "http://127.0.0.1:5000/v1/audio/transcriptions"
+    console.log("whisper request", { url, model, language: whisper?.language, bytes: prepared.buffer.byteLength })
+    const result = await fetch(url, {
+      method: "POST",
+      headers: { Authorization: `Bearer ${apiKey}` },
+      body: form,
+      signal: input.signal,
+    })
+
+    if (!result.ok) {
+      const message = await result.text().catch(() => "")
+      throw new Error(message || "Whisper request failed")
+    }
+
+    const contentType = result.headers.get("content-type") ?? ""
+    const body = await result.text().catch(() => "")
+    console.log("whisper response", { contentType, body })
+    // A 2xx answer is not guaranteed to be JSON (e.g. a proxy error page); report that
+    // clearly instead of leaking a raw SyntaxError from JSON.parse to the caller.
+    let payload: unknown = { text: "" }
+    try {
+      if (body) payload = JSON.parse(body)
+    } catch {
+      throw new Error(`Whisper returned a non-JSON response (${contentType || "unknown content type"})`)
+    }
+    const text = (payload as { text?: unknown } | null)?.text
+    return Response.parse({ text: typeof text === "string" ? text : "" })
+  }
+}
diff --git a/packages/sdk/js/src/gen/types.gen.ts b/packages/sdk/js/src/gen/types.gen.ts
index 8eefe5bfe985..43ad006237d3 100644
--- a/packages/sdk/js/src/gen/types.gen.ts
+++ b/packages/sdk/js/src/gen/types.gen.ts
@@ -938,6 +938,10 @@ export type KeybindsConfig = {
    * Paste from clipboard
    */
   input_paste?: string
+  /**
+   * Toggle voice input
+   */
+  input_voice?: string
   /**
    * Submit input
    */
@@ -1207,6 +1211,45 @@ export type Config = {
      * Control diff rendering style: 'auto' adapts to terminal width, 'stacked' always shows single column
      */
     diff_style?: "auto" | "stacked"
+    /**
+     * Voice input settings
+     */
+    voice?: {
+      /**
+       * Recorder command template with {output} placeholder
+       */
+      command?: Array<string>
+      /**
+       * Recorded audio mime type
+       */
+      mime?: string
+    }
+  }
+  /**
+   * Voice transcription settings
+   */
+  voice?: {
+    /**
+     * Whisper transcription settings
+     */
+    whisper?: {
+      /**
+       * Whisper API URL
+       */
+      url?: string
+      /**
+       * Whisper API key
+       */
+      apiKey?: string
+      /**
+       * Whisper model name
+       */
+      model?: string
+      /**
+       * Whisper language code
+       */
+      language?: string
+    }
   }
   /**
    * Command configuration, see https://opencode.ai/docs/commands
diff --git a/packages/sdk/js/src/v2/gen/sdk.gen.ts b/packages/sdk/js/src/v2/gen/sdk.gen.ts
index b757b7535075..cfb40034fd3d 100644
--- a/packages/sdk/js/src/v2/gen/sdk.gen.ts
+++ b/packages/sdk/js/src/v2/gen/sdk.gen.ts
@@ -8,6 +8,7 @@ import type {
   AppLogErrors,
   AppLogResponses,
   AppSkillsResponses,
+  AudioTranscribeResponses,
   Auth as Auth3,
   AuthRemoveErrors,
   AuthRemoveResponses,
@@ -2161,6 +2162,27 @@ export class Provider extends HeyApiClient {
   }
 }
 
+export class Audio extends HeyApiClient {
+  /**
+   * Transcribe audio
+   *
+   * Transcribe an audio file with Whisper
+   */
+  public transcribe<ThrowOnError extends boolean = false>(
+    parameters?: {
+      directory?: string
+    },
+    options?: Options<never, ThrowOnError>,
+  ) {
+    const params = buildClientParams([parameters], [{ args: [{ in: "query", key: "directory" }] }])
+    return (options?.client ?? this.client).post<AudioTranscribeResponses, unknown, ThrowOnError>({
+      url: "/voice/transcribe",
+      ...options,
+      ...params,
+    })
+  }
+}
+
 export class Find extends HeyApiClient {
   /**
    * Find text
@@ -3251,6 +3273,11 @@ export class OpencodeClient extends HeyApiClient {
     return (this._provider ??= new Provider({ client: this.client }))
   }
 
+  private _audio?: Audio
+  get audio(): Audio {
+    return (this._audio ??= new Audio({ client: this.client }))
+  }
+
   private _find?: Find
   get find(): Find {
     return (this._find ??= new Find({ client: this.client }))
diff --git a/packages/sdk/js/src/v2/gen/types.gen.ts b/packages/sdk/js/src/v2/gen/types.gen.ts
index cb1606e3f610..89f6c8142ec7 100644
--- a/packages/sdk/js/src/v2/gen/types.gen.ts
+++ b/packages/sdk/js/src/v2/gen/types.gen.ts
@@ -1137,6 +1137,10 @@ export type KeybindsConfig = {
    * Paste from clipboard
    */
   input_paste?: string
+  /**
+   * Toggle voice input
+   */
+  input_voice?: string
   /**
    * Submit input
    */
@@ -1640,6 +1644,45 @@ export type Config = {
      * Control diff rendering style: 'auto' adapts to terminal width, 'stacked' always shows single column
      */
     diff_style?: "auto" | "stacked"
+    /**
+     * Voice input settings
+     */
+    voice?: {
+      /**
+       * Recorder command template with {output} placeholder
+       */
+      command?: Array<string>
+      /**
+       * Recorded audio mime type
+       */
+      mime?: string
+    }
+  }
+  /**
+   * Voice transcription settings
+   */
+  voice?: {
+    /**
+     * Whisper transcription settings
+     */
+    whisper?: {
+      /**
+       * Whisper API URL
+       */
+      url?: string
+      /**
+       * Whisper API key
+       */
+      apiKey?: string
+      /**
+       * Whisper model name
+       */
+      model?: string
+      /**
+       * Whisper language code
+       */
+      language?: string
+    }
   }
   server?: ServerConfig
   /**
@@ -4123,6 +4166,26 @@ export type ProviderOauthCallbackResponses = {
 
 export type ProviderOauthCallbackResponse = ProviderOauthCallbackResponses[keyof ProviderOauthCallbackResponses]
 
+export type AudioTranscribeData = {
+  body?: never
+  path?: never
+  query?: {
+    directory?: string
+  }
+  url: "/voice/transcribe"
+}
+
+export type AudioTranscribeResponses = {
+  /**
+   * Transcription result
+   */
+  200: {
+    text?: string
+  }
+}
+
+export type AudioTranscribeResponse = AudioTranscribeResponses[keyof AudioTranscribeResponses]
+
 export type FindTextData = {
   body?: never
   path?: never
diff --git a/packages/ui/src/components/icon.tsx b/packages/ui/src/components/icon.tsx
index 544c6abdd214..8e16c8214c5e 100644
--- a/packages/ui/src/components/icon.tsx
+++ b/packages/ui/src/components/icon.tsx
@@ -61,6 +61,7 @@ const icons = {
   share: `<path d="M10.0013 12.0846L10.0013 3.33464M13.7513 6.66797L10.0013 2.91797L6.2513 6.66797M17.0846 10.418V17.0846H2.91797V10.418" stroke="currentColor" stroke-linecap="square"/>`,
   download: `<path d="M13.9583 10.6257L10 14.584L6.04167 10.6257M10 2.08398V13.959M16.25 17.9173H3.75" stroke="currentColor" stroke-linecap="square"/>`,
   menu: `<path d="M2.5 5H17.5M2.5 10H17.5M2.5 15H17.5" stroke="currentColor" stroke-linecap="square"/>`,
+  mic: `<path d="M9.99984 12.0833C8.61912 12.0833 7.49984 10.964 7.49984 9.58329V5.41663C7.49984 4.03592 8.61912 2.91663 9.99984 2.91663C11.3806 2.91663 12.4998 4.03592 12.4998 5.41663V9.58329C12.4998 10.964 11.3806 12.0833 9.99984 12.0833Z" stroke="currentColor" stroke-linecap="square"/><path d="M5.83317 9.58325C5.83317 11.6543 7.5121 13.3333 9.58317 13.3333H10.4165C12.4876 13.3333 14.1665 11.6543 14.1665 9.58325" stroke="currentColor" stroke-linecap="square"/><path d="M9.99984 13.3333V17.0833M7.08317 17.0833H12.9165" stroke="currentColor" stroke-linecap="square"/>`,
   server: `<rect x="3.35547" y="1.92969" width="13.2857" height="16.1429" stroke="currentColor"/><rect x="3.35547" y="11.9297" width="13.2857" height="6.14286" stroke="currentColor"/><rect x="12.8555" y="14.2852" width="1.42857" height="1.42857" fill="currentColor"/><rect x="10" y="14.2852" width="1.42857" height="1.42857" fill="currentColor"/>`,
   branch: `<path d="M14.2036 7.19987L14.2079 6.69989L13.2079 6.69132L13.2036 7.1913L13.7036 7.19559L14.2036 7.19987ZM8.14804 5.09032H7.64804C7.64804 5.75797 7.06861 6.34471 6.29619 6.34471V6.84471V7.34471C7.56926 7.34471 8.64804 6.36051 8.64804 5.09032H8.14804ZM6.29619 6.84471V6.34471C5.52376 6.34471 4.94434 5.75797 4.94434 5.09032H4.44434H3.94434C3.94434 6.36051 5.02311 7.34471 6.29619 7.34471V6.84471ZM4.44434 5.09032H4.94434C4.94434 4.42267 5.52376 3.83594 6.29619 3.83594V3.33594V2.83594C5.02311 2.83594 3.94434 3.82013 3.94434 5.09032H4.44434ZM6.29619 3.33594V3.83594C7.06861 3.83594 7.64804 4.42267 7.64804 5.09032H8.14804H8.64804C8.64804 3.82013 7.56926 2.83594 6.29619 2.83594V3.33594ZM8.14804 14.9149H7.64804C7.64804 15.5825 7.06861 16.1693 6.29619 16.1693V16.6693V17.1693C7.56926 17.1693 8.64804 16.1851 8.64804 14.9149H8.14804ZM6.29619 16.6693V16.1693C5.52376 16.1693 4.94434 15.5825 4.94434 14.9149H4.44434H3.94434C3.94434 16.1851 5.02311 17.1693 6.29619 17.1693V16.6693ZM4.44434 14.9149H4.94434C4.94434 14.2472 5.52376 13.6605 6.29619 13.6605V13.1605V12.6605C5.02311 12.6605 3.94434 13.6447 3.94434 14.9149H4.44434ZM6.29619 13.1605V13.6605C7.06861 13.6605 7.64804 14.2472 7.64804 14.9149H8.14804H8.64804C8.64804 13.6447 7.56926 12.6605 6.29619 12.6605V13.1605ZM15.5554 5.09032H15.0554C15.0554 5.75797 14.476 6.34471 13.7036 6.34471V6.84471V7.34471C14.9767 7.34471 16.0554 6.36051 16.0554 5.09032H15.5554ZM13.7036 6.84471V6.34471C12.9312 6.34471 12.3517 5.75797 12.3517 5.09032H11.8517H11.3517C11.3517 6.36051 12.4305 7.34471 13.7036 7.34471V6.84471ZM11.8517 5.09032H12.3517C12.3517 4.42267 12.9312 3.83594 13.7036 3.83594V3.33594V2.83594C12.4305 2.83594 11.3517 3.82013 11.3517 5.09032H11.8517ZM13.7036 3.33594V3.83594C14.476 3.83594 15.0554 4.42267 15.0554 5.09032H15.5554H16.0554C16.0554 3.82013 14.9767 2.83594 13.7036 2.83594V3.33594ZM13.7036 7.19559L13.2036 7.1913L13.1544 12.9277L13.6544 12.932L14.1544 12.9363L14.2036 7.19987L13.7036 7.19559ZM6.29619 
6.84471H5.79619V13.1605H6.29619H6.79619V6.84471H6.29619ZM11.6545 14.9149V14.4149H8.14804V14.9149V15.4149H11.6545V14.9149ZM13.6544 12.932L13.1544 12.9277C13.1474 13.7511 12.4779 14.4149 11.6545 14.4149V14.9149V15.4149C13.0269 15.4149 14.1426 14.3086 14.1544 12.9363L13.6544 12.932Z" fill="currentColor"/>`,
   edit: `<path d="M17.0832 17.0807V17.5807H17.5832V17.0807H17.0832ZM2.9165 17.0807H2.4165V17.5807H2.9165V17.0807ZM2.9165 2.91406V2.41406H2.4165V2.91406H2.9165ZM9.58317 3.41406H10.0832V2.41406H9.58317V2.91406V3.41406ZM17.5832 10.4141V9.91406H16.5832V10.4141H17.0832H17.5832ZM6.24984 11.2474L5.89628 10.8938L5.74984 11.0403V11.2474H6.24984ZM6.24984 13.7474H5.74984V14.2474H6.24984V13.7474ZM8.74984 13.7474V14.2474H8.95694L9.10339 14.101L8.74984 13.7474ZM15.2082 2.28906L15.5617 1.93551L15.2082 1.58196L14.8546 1.93551L15.2082 2.28906ZM17.7082 4.78906L18.0617 5.14262L18.4153 4.78906L18.0617 4.43551L17.7082 4.78906ZM17.0832 17.0807V16.5807H2.9165V17.0807V17.5807H17.0832V17.0807ZM2.9165 17.0807H3.4165V2.91406H2.9165H2.4165V17.0807H2.9165ZM2.9165 2.91406V3.41406H9.58317V2.91406V2.41406H2.9165V2.91406ZM17.0832 10.4141H16.5832V17.0807H17.0832H17.5832V10.4141H17.0832ZM6.24984 11.2474H5.74984V13.7474H6.24984H6.74984V11.2474H6.24984ZM6.24984 13.7474V14.2474H8.74984V13.7474V13.2474H6.24984V13.7474ZM6.24984 11.2474L6.60339 11.6009L15.5617 2.64262L15.2082 2.28906L14.8546 1.93551L5.89628 10.8938L6.24984 11.2474ZM15.2082 2.28906L14.8546 2.64262L17.3546 5.14262L17.7082 4.78906L18.0617 4.43551L15.5617 1.93551L15.2082 2.28906ZM17.7082 4.78906L17.3546 4.43551L8.39628 13.3938L8.74984 13.7474L9.10339 14.101L18.0617 5.14262L17.7082 4.78906Z" fill="currentColor"/>`,
diff --git a/packages/web/src/content/docs/config.mdx b/packages/web/src/content/docs/config.mdx
index 5cc9d8666a96..5433a0c73cba 100644
--- a/packages/web/src/content/docs/config.mdx
+++ b/packages/web/src/content/docs/config.mdx
@@ -179,6 +179,33 @@ Available options:
 
 ---
 
+### Voice
+
+Configure voice transcription for the Whisper API with the `voice` option.
+
+```json title="opencode.json"
+{
+  "$schema": "https://opencode.ai/config.json",
+  "voice": {
+    "whisper": {
+      "url": "http://127.0.0.1:5000/v1/audio/transcriptions",
+      "apiKey": "{env:OPENCODE_WHISPER_API_KEY}",
+      "model": "whisper-1",
+      "language": "en"
+    }
+  }
+}
+```
+
+Available options:
+
+- `whisper.url` - Whisper transcription endpoint URL.
+- `whisper.apiKey` - API key for the Whisper service.
+- `whisper.model` - Whisper model name (default: `whisper-1`).
+- `whisper.language` - Optional language hint (e.g. `en`).
+
+---
+
 ### Server
 
 You can configure server settings for the `opencode serve` and `opencode web` commands through the `server` option.

From 28813065128c1f21f53309893d2333d33dbc80b6 Mon Sep 17 00:00:00 2001
From: heimoshuiyu <heimoshuiyu@gmail.com>
Date: Sun, 18 Jan 2026 23:18:55 +0800
Subject: [PATCH 2/7] feat: add ALM voice transcription

---
 .../cli/cmd/tui/component/prompt/index.tsx    |   4 +-
 .../opencode/src/cli/cmd/tui/util/voice.ts    |  68 +++++++++---
 packages/opencode/src/config/config.ts        |  11 ++
 packages/opencode/src/server/routes/voice.ts  |  37 +++++--
 packages/opencode/src/voice/alm.ts            | 100 ++++++++++++++++++
 packages/opencode/src/voice/whisper.ts        |   6 +-
 packages/web/src/content/docs/config.mdx      |  25 ++++-
 7 files changed, 221 insertions(+), 30 deletions(-)
 create mode 100644 packages/opencode/src/voice/alm.ts

diff --git a/packages/opencode/src/cli/cmd/tui/component/prompt/index.tsx b/packages/opencode/src/cli/cmd/tui/component/prompt/index.tsx
index c5463241e650..ec45e0feab67 100644
--- a/packages/opencode/src/cli/cmd/tui/component/prompt/index.tsx
+++ b/packages/opencode/src/cli/cmd/tui/component/prompt/index.tsx
@@ -734,7 +734,7 @@ export function Prompt(props: PromptProps) {
       }
       if (!result.text.trim()) {
         toast.show({
-          message: "No speech detected (Whisper returned empty text)",
+          message: "No speech detected (transcription returned empty text)",
           variant: "warning",
         })
         return
@@ -750,7 +750,7 @@ export function Prompt(props: PromptProps) {
     const enabled = voice.isEnabled()
     if (!enabled) {
       toast.show({
-        message: "Voice input unavailable (disabled or missing OPENCODE_WHISPER_API_KEY)",
+        message: "Voice input unavailable (missing transcription API key)",
         variant: "warning",
       })
       return
diff --git a/packages/opencode/src/cli/cmd/tui/util/voice.ts b/packages/opencode/src/cli/cmd/tui/util/voice.ts
index f176c8495e84..4d043a44a49c 100644
--- a/packages/opencode/src/cli/cmd/tui/util/voice.ts
+++ b/packages/opencode/src/cli/cmd/tui/util/voice.ts
@@ -1,6 +1,7 @@
 import { tmpdir } from "os"
 import path from "path"
 import { Config } from "@/config/config"
+import { Alm } from "@/voice/alm"
 import { Whisper } from "@/voice/whisper"
 
 export type VoiceConfig = {
@@ -18,6 +19,15 @@ const defaultCommands = [
 
 const defaultMime = "audio/mpeg"
 
+// Picks the transcription provider: an explicit `type` wins; otherwise the provider that has an
+// API key is used, with Whisper preferred when both (or neither) are configured.
+const resolveType = (voice?: Config.Info["voice"]) => {
+  if (voice?.type) return voice.type
+  if (voice?.whisper?.apiKey) return "whisper"
+  if (voice?.alm?.apiKey) return "alm"
+  return "whisper"
+}
+
 const pickCommand = (config?: VoiceConfig) => {
   if (config?.command?.length) return config.command
   for (const candidate of defaultCommands) {
@@ -48,8 +58,10 @@ export namespace Voice {
     }
 
     const isEnabled = () => {
-      if (!input.transcription?.()?.whisper?.apiKey) return false
-      return true
+      const voice = input.transcription?.()
+      const type = resolveType(voice)
+      if (type === "alm") return !!voice?.alm?.apiKey
+      return !!voice?.whisper?.apiKey
     }
 
     const start = async () => {
@@ -88,25 +100,47 @@ export namespace Voice {
 
       const blob = new Blob([buffer], { type: mime })
       const apiFile = new File([blob], "audio.mp3", { type: mime })
-      console.log("whisper transcribe start", {
-        bytes: buffer.byteLength,
-        url: input.transcription?.()?.whisper?.url,
-        model: input.transcription?.()?.whisper?.model,
-        language: input.transcription?.()?.whisper?.language,
-      })
+      const voice = input.transcription?.()
+      const type = resolveType(voice)
+      if (type === "alm") {
+        console.log("voice transcribe start", {
+          provider: "alm",
+          bytes: buffer.byteLength,
+          url: voice?.alm?.url,
+          model: voice?.alm?.model,
+        })
+      }
+      if (type === "whisper") {
+        console.log("voice transcribe start", {
+          provider: "whisper",
+          bytes: buffer.byteLength,
+          url: voice?.whisper?.url,
+          model: voice?.whisper?.model,
+          language: voice?.whisper?.language,
+        })
+      }
       state.cancelled = false
       state.controller = new AbortController()
-      const result = await Whisper.transcribe({
-        file: apiFile,
-        mime,
-        sessionID: input.sessionID?.(),
-        prompt: input.prompt?.(),
-        signal: state.controller.signal,
-        voice: input.transcription?.(),
-      })
+      const result = await (type === "alm"
+        ? Alm.transcribe({
+            file: apiFile,
+            mime,
+            sessionID: input.sessionID?.(),
+            prompt: input.prompt?.(),
+            signal: state.controller.signal,
+            voice,
+          })
+        : Whisper.transcribe({
+            file: apiFile,
+            mime,
+            sessionID: input.sessionID?.(),
+            prompt: input.prompt?.(),
+            signal: state.controller.signal,
+            voice,
+          }))
         .then((response) => ({ text: response.text, cancelled: false }))
         .catch((error) => {
-          console.log("whisper transcribe failed", { error: String(error) })
+          console.log("voice transcribe failed", { error: String(error), provider: type })
           if (error?.name === "AbortError" || state.cancelled) return { text: "", cancelled: true }
           throw error
         })
diff --git a/packages/opencode/src/config/config.ts b/packages/opencode/src/config/config.ts
index 864b69c8eafa..d12b72f879d3 100644
--- a/packages/opencode/src/config/config.ts
+++ b/packages/opencode/src/config/config.ts
@@ -944,6 +944,7 @@ export namespace Config {
 
   export const Voice = z
     .object({
+      type: z.enum(["whisper", "alm"]).optional().describe("Transcription provider type"),
       whisper: z
         .object({
           url: z.string().optional().describe("Whisper API URL"),
@@ -953,6 +954,16 @@ export namespace Config {
         })
         .optional()
         .describe("Whisper transcription settings"),
+      alm: z
+        .object({
+          url: z.string().optional().describe("Audio LM API URL"),
+          apiKey: z.string().optional().describe("Audio LM API key"),
+          model: z.string().optional().describe("Audio LM model name"),
+          prompt: z.string().optional().describe("Audio LM base prompt"),
+          system: z.string().optional().describe("Audio LM system prompt"),
+        })
+        .optional()
+        .describe("Audio language model transcription settings"),
     })
     .describe("Voice transcription settings")
 
diff --git a/packages/opencode/src/server/routes/voice.ts b/packages/opencode/src/server/routes/voice.ts
index f044a4d506fe..548316e49062 100644
--- a/packages/opencode/src/server/routes/voice.ts
+++ b/packages/opencode/src/server/routes/voice.ts
@@ -1,16 +1,27 @@
 import { describeRoute, resolver } from "hono-openapi"
 import { zValidator } from "@hono/zod-validator"
 import z from "zod"
+import { Config } from "@/config/config"
+import { Alm } from "@/voice/alm"
 import { Whisper } from "@/voice/whisper"
 import { lazy } from "@/util/lazy"
 import { Hono } from "hono"
 
+// Picks the transcription provider: an explicit `type` wins; otherwise the provider that has an
+// API key is used, with Whisper preferred when both (or neither) are configured.
+const resolveType = (voice?: Config.Info["voice"]) => {
+  if (voice?.type) return voice.type
+  if (voice?.whisper?.apiKey) return "whisper"
+  if (voice?.alm?.apiKey) return "alm"
+  return "whisper"
+}
+
 export const VoiceRoutes = lazy(() =>
   new Hono().post(
     "/transcribe",
     describeRoute({
       summary: "Transcribe audio",
-      description: "Transcribe an audio file with Whisper",
+      description: "Transcribe an audio file with Whisper or an audio language model",
       operationId: "audio.transcribe",
       responses: {
         200: {
@@ -34,12 +45,24 @@ export const VoiceRoutes = lazy(() =>
     async (c) => {
       const data = c.req.valid("form")
       const file = data.file
-      const result = await Whisper.transcribe({
-        file,
-        mime: file.type || "audio/wav",
-        sessionID: data.sessionID,
-        prompt: data.prompt,
-      })
+      const mime = file.type || "audio/wav"
+      const voice = (await Config.get()).voice
+      const type = resolveType(voice)
+      const result = await (type === "alm"
+        ? Alm.transcribe({
+            file,
+            mime,
+            sessionID: data.sessionID,
+            prompt: data.prompt,
+            voice,
+          })
+        : Whisper.transcribe({
+            file,
+            mime,
+            sessionID: data.sessionID,
+            prompt: data.prompt,
+            voice,
+          }))
       return c.json(result)
     },
   ),
diff --git a/packages/opencode/src/voice/alm.ts b/packages/opencode/src/voice/alm.ts
new file mode 100644
index 000000000000..de720e88c15f
--- /dev/null
+++ b/packages/opencode/src/voice/alm.ts
@@ -0,0 +1,100 @@
+import z from "zod"
+import { Config } from "@/config/config"
+import { buildPrompt, getLastAssistantText, toWavOrMp3 } from "@/voice/whisper"
+
+/**
+ * Builds the chat messages for an audio LM transcription request: a system message (the default
+ * instruction or `input.system`, plus a `<context>` block when context is non-empty) followed by a
+ * user message carrying the audio URL and a short transcription instruction.
+ */
+const buildMessages = (input: { system?: string; context?: string; audio: string }) => {
+  const fallback = "You are a professional speech-to-text transcriber. Your task is to transcribe the audio into text."
+  const instruction = (input.system ?? fallback).trim()
+  const hint = input.context?.trim()
+  const reminder = "DO NOT answer user's question, just transcribe the audio into text."
+  // Only wrap the instruction with context when there is something to show the model.
+  const systemText = hint ? [instruction, "<context>", hint, "</context>", reminder].join("\n") : instruction
+  const request = "you are a professional speech to text transcriber, your task is to transcribe the audio into text."
+  return [
+    { role: "system" as const, content: systemText },
+    {
+      role: "user" as const,
+      content: [
+        { type: "audio_url", audio_url: { url: input.audio } },
+        { type: "text", text: request },
+      ],
+    },
+  ]
+}
+
+/** Speech-to-text through a chat-completions style endpoint that accepts `audio_url` content parts. */
+export namespace Alm {
+  /** Input accepted by `transcribe`: the recorded audio plus optional session and prompt context. */
+  export const Request = z.object({
+    file: z.instanceof(File),
+    mime: z.string(),
+    sessionID: z.string().optional(),
+    prompt: z.string().optional(),
+  })
+
+  /** Transcription result; `text` is "" when the model returned no usable string content. */
+  export const Response = z.object({ text: z.string().default("") })
+
+  export type Response = z.infer<typeof Response>
+
+  /**
+   * Sends the audio, inlined as a base64 data URL, to the configured audio LM and returns the transcript.
+   *
+   * @param input Audio file and mime type, optional `sessionID`/`prompt` used as context, an optional
+   *   abort `signal`, and an optional `voice` config that overrides the one loaded from `Config.get()`.
+   * @returns `{ text }`, where `text` is the first choice's message content or "" when it is not a string.
+   * @throws Error when `voice.alm.apiKey` is missing, the HTTP response is not ok, or the body is not JSON.
+   */
+  export async function transcribe(
+    input: z.infer<typeof Request> & { signal?: AbortSignal; voice?: Config.Info["voice"] },
+  ) {
+    const voice = input.voice ?? (await Config.get()).voice
+    const alm = voice?.alm
+    const apiKey = alm?.apiKey
+    if (!apiKey) throw new Error("Missing voice.alm.apiKey")
+
+    const content = await input.file.arrayBuffer()
+    const prepared = await toWavOrMp3({ buffer: content, mime: input.mime })
+    const audio = `data:${prepared.mime};base64,${Buffer.from(prepared.buffer).toString("base64")}`
+
+    const assistant = await getLastAssistantText(input.sessionID)
+    const context = buildPrompt({ assistant, prompt: buildPrompt({ assistant: alm?.prompt, prompt: input.prompt }) })
+    const messages = buildMessages({ system: alm?.system, context, audio })
+
+    const payload = {
+      model: alm?.model ?? "gpt-4o-mini-transcribe",
+      messages,
+      temperature: 0,
+    }
+
+    const url = alm?.url ?? "https://api.openai.com/v1/chat/completions"
+    console.log("alm request", { url, model: payload.model, bytes: prepared.buffer.byteLength })
+
+    const result = await fetch(url, {
+      method: "POST",
+      headers: {
+        "Content-Type": "application/json",
+        Authorization: `Bearer ${apiKey}`,
+      },
+      body: JSON.stringify(payload),
+      signal: input.signal,
+    })
+
+    if (!result.ok) {
+      const message = await result.text().catch(() => "")
+      throw new Error(message || `ALM request failed (${result.status})`)
+    }
+
+    // Let read failures propagate: swallowing them would turn an abort mid-download into an empty transcript.
+    const body = await result.text()
+    console.log("alm response", { body })
+    const parsed = body ? JSON.parse(body) : {}
+    const text = parsed?.choices?.[0]?.message?.content
+    return Response.parse({ text: typeof text === "string" ? text : "" })
+  }
+}
diff --git a/packages/opencode/src/voice/whisper.ts b/packages/opencode/src/voice/whisper.ts
index 8a3c1f91f424..7542f207b70a 100644
--- a/packages/opencode/src/voice/whisper.ts
+++ b/packages/opencode/src/voice/whisper.ts
@@ -4,7 +4,7 @@ import { tmpdir } from "os"
 import path from "path"
 import z from "zod"
 
-const toWavOrMp3 = async (input: { buffer: ArrayBuffer; mime: string }) => {
+export const toWavOrMp3 = async (input: { buffer: ArrayBuffer; mime: string }) => {
   const isWav = input.mime.includes("wav")
   const isMp3 = input.mime.includes("mpeg") || input.mime.includes("mp3")
   if (isWav || isMp3) {
@@ -47,7 +47,7 @@ const toWavOrMp3 = async (input: { buffer: ArrayBuffer; mime: string }) => {
   return { buffer, name: "audio.mp3", mime: "audio/mpeg" }
 }
 
-const getLastAssistantText = async (sessionID?: string) => {
+export const getLastAssistantText = async (sessionID?: string) => {
   if (!sessionID) return ""
   return Promise.resolve()
     .then(() => Session.messages({ sessionID, limit: 50 }))
@@ -70,7 +70,7 @@ const getLastAssistantText = async (sessionID?: string) => {
     })
 }
 
-const buildPrompt = (input: { prompt?: string; assistant?: string }) => {
+export const buildPrompt = (input: { prompt?: string; assistant?: string }) => {
   const head = input.assistant?.trim() ?? ""
   const tail = input.prompt?.trim() ?? ""
   if (!head) return tail
diff --git a/packages/web/src/content/docs/config.mdx b/packages/web/src/content/docs/config.mdx
index 5433a0c73cba..4228fffe39a3 100644
--- a/packages/web/src/content/docs/config.mdx
+++ b/packages/web/src/content/docs/config.mdx
@@ -181,12 +181,13 @@ Available options:
 
 ### Voice
 
-Configure voice transcription for the Whisper API with the `voice` option.
+Configure voice transcription with Whisper or an audio language model (ALM) using the `voice` option.
 
 ```json title="opencode.json"
 {
   "$schema": "https://opencode.ai/config.json",
   "voice": {
+    "type": "whisper",
     "whisper": {
       "url": "http://127.0.0.1:5000/v1/audio/transcriptions",
       "apiKey": "{env:OPENCODE_WHISPER_API_KEY}",
@@ -197,12 +198,34 @@ Configure voice transcription for the Whisper API with the `voice` option.
 }
 ```
 
+```json title="opencode.json"
+{
+  "$schema": "https://opencode.ai/config.json",
+  "voice": {
+    "type": "alm",
+    "alm": {
+      "url": "https://api.openai.com/v1/chat/completions",
+      "apiKey": "{env:OPENCODE_ALM_API_KEY}",
+      "model": "gpt-4o-mini-transcribe",
+      "system": "You are a professional speech-to-text transcriber. Your task is to transcribe the audio into text.",
+      "prompt": "Keep technical terms unchanged."
+    }
+  }
+}
+```
+
 Available options:
 
+- `type` - Transcription provider (`whisper` or `alm`). When omitted, the provider that has an `apiKey` is used, preferring Whisper.
 - `whisper.url` - Whisper transcription endpoint URL.
 - `whisper.apiKey` - API key for the Whisper service.
 - `whisper.model` - Whisper model name (default: `whisper-1`).
 - `whisper.language` - Optional language hint (e.g. `en`).
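+- `alm.url` - Audio LM chat completions endpoint URL (default: `https://api.openai.com/v1/chat/completions`).
+- `alm.apiKey` - API key for the ALM service.
+- `alm.model` - Audio LM model name (default: `gpt-4o-mini-transcribe`).
+- `alm.prompt` - Optional base prompt added to the transcription context.
+- `alm.system` - Optional system prompt that replaces the built-in transcriber instruction.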
 
 ---
 

From 8381836ddaf915f53b44f9eda970eba8d717bc3d Mon Sep 17 00:00:00 2001
From: heimoshuiyu <heimoshuiyu@gmail.com>
Date: Thu, 5 Feb 2026 15:45:29 +0800
Subject: [PATCH 3/7] Add web deploy skill and configurable web proxy

---
 .opencode/skill/web-s3-deploy/SKILL.md | 30 ++++++++++++++++++++++++++
 packages/opencode/src/flag/flag.ts     |  1 +
 packages/opencode/src/server/server.ts |  7 +++---
 3 files changed, 35 insertions(+), 3 deletions(-)
 create mode 100644 .opencode/skill/web-s3-deploy/SKILL.md

diff --git a/.opencode/skill/web-s3-deploy/SKILL.md b/.opencode/skill/web-s3-deploy/SKILL.md
new file mode 100644
index 000000000000..e4e5cc585590
--- /dev/null
+++ b/.opencode/skill/web-s3-deploy/SKILL.md
@@ -0,0 +1,30 @@
+---
+name: web-s3-deploy
+description: Build the web frontend, sync to S3, and invalidate CloudFront
+---
+
+## What I do
+Provide a repeatable workflow to publish the web frontend to a public S3 bucket and refresh a CloudFront distribution so HTTPS updates are visible.
+
+## When to use me
+Use this when you need to ship a new web UI build for OpenCode and make sure CloudFront serves the latest assets.
+
+## Checklist
+1. Build the frontend locally.
+2. Sync the build output to the S3 bucket.
+3. Trigger a CloudFront invalidation to refresh cached assets.
+
+## Commands
+```bash
+bun run --cwd packages/app build
+aws s3 sync packages/app/dist s3://opencode-hmsy --delete --exact-timestamps
+aws cloudfront create-invalidation --distribution-id E30UYS44QZ0UX4 --paths "/*"
+```
+
+## Notes
+- S3 website URL: http://opencode-hmsy.s3-website-ap-southeast-1.amazonaws.com
+- CloudFront HTTPS URL: https://d3ir6x3lfy3u68.cloudfront.net
+- Set `OPENCODE_WEB_URL=https://d3ir6x3lfy3u68.cloudfront.net` so the OpenCode server proxies the web UI from this distribution.
+- For S3 website hosting, ensure the bucket policy allows public read.
+- The CloudFront distribution should use the S3 website endpoint as its origin for SPA routing.
+- If you only need cache refresh after content changes, you can skip the build step.
diff --git a/packages/opencode/src/flag/flag.ts b/packages/opencode/src/flag/flag.ts
index b11058b34058..55d94390f1f3 100644
--- a/packages/opencode/src/flag/flag.ts
+++ b/packages/opencode/src/flag/flag.ts
@@ -30,6 +30,7 @@ export namespace Flag {
   export declare const OPENCODE_CLIENT: string
   export const OPENCODE_SERVER_PASSWORD = process.env["OPENCODE_SERVER_PASSWORD"]
   export const OPENCODE_SERVER_USERNAME = process.env["OPENCODE_SERVER_USERNAME"]
+  export const OPENCODE_WEB_URL = process.env["OPENCODE_WEB_URL"] || undefined // an empty value counts as unset
 
   // Experimental
   export const OPENCODE_EXPERIMENTAL = truthy("OPENCODE_EXPERIMENTAL")
diff --git a/packages/opencode/src/server/server.ts b/packages/opencode/src/server/server.ts
index e01aa4276448..0875f808f4ce 100644
--- a/packages/opencode/src/server/server.ts
+++ b/packages/opencode/src/server/server.ts
@@ -534,12 +534,13 @@ export namespace Server {
         )
         .all("/*", async (c) => {
           const path = c.req.path
-
-          const response = await proxy(`https://app.opencode.ai${path}`, {
+          // Fork override: default web URL points to personal CloudFront; upstream default was https://app.opencode.ai
+          const target = Flag.OPENCODE_WEB_URL ?? "https://d3ir6x3lfy3u68.cloudfront.net"
+          const response = await proxy(`${target}${path}`, {
             ...c.req,
             headers: {
               ...c.req.raw.headers,
-              host: "app.opencode.ai",
+              host: new URL(target).host,
             },
           })
           response.headers.set(

From a5f2fe2a489b039748cba0cd3bb51f81f6a600f1 Mon Sep 17 00:00:00 2001
From: heimoshuiyu <heimoshuiyu@gmail.com>
Date: Mon, 19 Jan 2026 02:25:46 +0800
Subject: [PATCH 4/7] Update default Whisper URL

---
 packages/opencode/src/voice/whisper.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/packages/opencode/src/voice/whisper.ts b/packages/opencode/src/voice/whisper.ts
index 7542f207b70a..b9a7fa77ac8b 100644
--- a/packages/opencode/src/voice/whisper.ts
+++ b/packages/opencode/src/voice/whisper.ts
@@ -122,7 +122,7 @@ export namespace Whisper {
       form.append("prompt", prompt)
     }
 
-    const url = whisper?.url ?? "http://127.0.0.1:5000/v1/audio/transcriptions"
+    const url = whisper?.url ?? "https://api.openai.com/v1/audio/transcriptions"
     console.log("whisper request", {
       url,
       model: whisper?.model ?? "whisper-1",

From cfbe583465ab7001d7d4f1563f1343bc7c1e1029 Mon Sep 17 00:00:00 2001
From: heimoshuiyu <heimoshuiyu@gmail.com>
Date: Thu, 22 Jan 2026 18:28:19 +0800
Subject: [PATCH 5/7] feat: show spinner while transcribing

---
 packages/app/src/components/prompt-input.tsx | 21 +++++++++++++-------
 packages/ui/src/components/icon-button.tsx   |  7 ++++---
 2 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/packages/app/src/components/prompt-input.tsx b/packages/app/src/components/prompt-input.tsx
index 198460c9323a..42d2824896ed 100644
--- a/packages/app/src/components/prompt-input.tsx
+++ b/packages/app/src/components/prompt-input.tsx
@@ -38,6 +38,7 @@ import { ProviderIcon } from "@opencode-ai/ui/provider-icon"
 import type { IconName } from "@opencode-ai/ui/icons/provider"
 import { Tooltip, TooltipKeybind } from "@opencode-ai/ui/tooltip"
 import { IconButton } from "@opencode-ai/ui/icon-button"
+import { Spinner } from "@opencode-ai/ui/spinner"
 import { Select } from "@opencode-ai/ui/select"
 import { getDirectory, getFilename, getFilenameTruncated } from "@opencode-ai/util/path"
 import { useDialog } from "@opencode-ai/ui/context/dialog"
@@ -2265,13 +2266,19 @@ export const PromptInput: Component<PromptInputProps> = (props) => {
               </Show>
             </div>
             <TooltipKeybind placement="top" title={voiceTitle()} keybind={command.keybind("prompt.voice")}>
-              <IconButton
-                type="button"
-                icon={transcribing() || recording() ? "stop" : "mic"}
-                variant="ghost"
-                class="h-6 w-6"
-                onClick={toggleVoice}
-              />
+              <Button type="button" variant="ghost" class="h-6 w-6" onClick={toggleVoice}>
+                <Switch>
+                  <Match when={transcribing()}>
+                    <Spinner class="size-4 text-icon-base" />
+                  </Match>
+                  <Match when={recording()}>
+                    <Icon name="stop" size="small" />
+                  </Match>
+                  <Match when={true}>
+                    <Icon name="mic" size="small" />
+                  </Match>
+                </Switch>
+              </Button>
             </TooltipKeybind>
             <Tooltip
               placement="top"
diff --git a/packages/ui/src/components/icon-button.tsx b/packages/ui/src/components/icon-button.tsx
index f1832ce7ffdf..a30e722fd633 100644
--- a/packages/ui/src/components/icon-button.tsx
+++ b/packages/ui/src/components/icon-button.tsx
@@ -1,5 +1,5 @@
 import { Button as Kobalte } from "@kobalte/core/button"
-import { type ComponentProps, splitProps } from "solid-js"
+import { type ComponentProps, children, splitProps } from "solid-js"
 import { Icon, IconProps } from "./icon"
 
 export interface IconButtonProps extends ComponentProps<typeof Kobalte> {
@@ -10,7 +10,8 @@ export interface IconButtonProps extends ComponentProps<typeof Kobalte> {
 }
 
 export function IconButton(props: ComponentProps<"button"> & IconButtonProps) {
-  const [split, rest] = splitProps(props, ["variant", "size", "iconSize", "class", "classList"])
+  const [split, rest] = splitProps(props, ["variant", "size", "iconSize", "class", "classList", "children"])
+  const content = children(() => split.children)
   return (
     <Kobalte
       {...rest}
@@ -22,7 +23,7 @@ export function IconButton(props: ComponentProps<"button"> & IconButtonProps) {
         [split.class ?? ""]: !!split.class,
       }}
     >
-      <Icon name={props.icon} size={split.iconSize ?? (split.size === "large" ? "normal" : "small")} />
+      {content() ?? <Icon name={props.icon} size={split.iconSize ?? (split.size === "large" ? "normal" : "small")} />}
     </Kobalte>
   )
 }

From bcf0765314f237bf6a725370f902f509a4578c77 Mon Sep 17 00:00:00 2001
From: heimoshuiyu <heimoshuiyu@gmail.com>
Date: Wed, 21 Jan 2026 01:45:03 +0800
Subject: [PATCH 6/7] Fix voice input insertion by ensuring selection is in
 editor

addPart now checks whether the current selection is within the prompt
editor before inserting a part, such as the text produced when voice
transcription completes. If the selection has no range or is outside the
editor (e.g., the user clicked on an assistant message during recording),
it focuses the editor and restores the cursor position from
prompt.cursor(), or from the editor's DOM position when prompt.cursor()
returns null or undefined, before inserting the transcribed text. This
prevents transcription results from being inserted into unintended
locations like assistant messages.

Also fixes cursor position logic to prefer real DOM position when
selection is inside the editor, only falling back to prompt.cursor()
when selection is outside.
---
 packages/app/src/components/prompt-input.tsx | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/packages/app/src/components/prompt-input.tsx b/packages/app/src/components/prompt-input.tsx
index 42d2824896ed..9c41c25e8832 100644
--- a/packages/app/src/components/prompt-input.tsx
+++ b/packages/app/src/components/prompt-input.tsx
@@ -1071,9 +1071,19 @@ export const PromptInput: Component<PromptInputProps> = (props) => {
 
   const addPart = (part: ContentPart) => {
     const selection = window.getSelection()
-    if (!selection || selection.rangeCount === 0) return
+    if (!selection) return
+
+    const hasRange = selection.rangeCount > 0
+    const inEditor = hasRange && editorRef.contains(selection.anchorNode)
+    const cursorPosition = inEditor
+      ? getCursorPosition(editorRef)
+      : (prompt.cursor() ?? getCursorPosition(editorRef))
+    if (!inEditor) {
+      editorRef.focus()
+      setCursorPosition(editorRef, cursorPosition)
+    }
+    if (selection.rangeCount === 0) return
 
-    const cursorPosition = getCursorPosition(editorRef)
     const currentPrompt = prompt.current()
     const rawText = currentPrompt.map((p) => ("content" in p ? p.content : "")).join("")
     const textBeforeCursor = rawText.substring(0, cursorPosition)

From 675d657c3d4929404a1933752f679ee9083e984c Mon Sep 17 00:00:00 2001
From: heimoshuiyu <heimoshuiyu@gmail.com>
Date: Thu, 22 Jan 2026 19:16:51 +0800
Subject: [PATCH 7/7] fix(tui): show warning toast when clicking disabled voice
 button

---
 .../src/cli/cmd/tui/component/prompt/index.tsx | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/packages/opencode/src/cli/cmd/tui/component/prompt/index.tsx b/packages/opencode/src/cli/cmd/tui/component/prompt/index.tsx
index ec45e0feab67..2b7a4dd6d856 100644
--- a/packages/opencode/src/cli/cmd/tui/component/prompt/index.tsx
+++ b/packages/opencode/src/cli/cmd/tui/component/prompt/index.tsx
@@ -1110,13 +1110,19 @@ export function Prompt(props: PromptProps) {
                 </box>
               </Show>
               <box flexGrow={1} />
-              <box
-                flexDirection="row"
-                onMouseUp={async () => {
-                  if (!voiceEnabled() && !store.recording && !store.processing) return
+                <box
+                  flexDirection="row"
+                  onMouseUp={async () => {
+                  if (!voiceEnabled() && !store.recording && !store.processing) {
+                    toast.show({
+                      message: "Voice input unavailable (missing transcription API key)",
+                      variant: "warning",
+                    })
+                    return
+                  }
                   await toggleVoice()
-                }}
-              >
+                  }}
+                >
                 <text fg={voiceColor()}>{voiceLabel()}</text>
               </box>
             </box>