diff --git a/codex-rs/app-server-protocol/schema/json/ClientRequest.json b/codex-rs/app-server-protocol/schema/json/ClientRequest.json index 094e631f20f..2e45005f51b 100644 --- a/codex-rs/app-server-protocol/schema/json/ClientRequest.json +++ b/codex-rs/app-server-protocol/schema/json/ClientRequest.json @@ -1499,6 +1499,13 @@ } ] }, + "RealtimeOutputModality": { + "enum": [ + "text", + "audio" + ], + "type": "string" + }, "RealtimeVoice": { "enum": [ "alloy", diff --git a/codex-rs/app-server-protocol/schema/json/ServerNotification.json b/codex-rs/app-server-protocol/schema/json/ServerNotification.json index c3ab83766a5..4edff15748a 100644 --- a/codex-rs/app-server-protocol/schema/json/ServerNotification.json +++ b/codex-rs/app-server-protocol/schema/json/ServerNotification.json @@ -3384,13 +3384,35 @@ ], "type": "object" }, - "ThreadRealtimeTranscriptUpdatedNotification": { + "ThreadRealtimeTranscriptDeltaNotification": { "description": "EXPERIMENTAL - flat transcript delta emitted whenever realtime transcript text changes.", + "properties": { + "delta": { + "description": "Live transcript delta from the realtime event.", + "type": "string" + }, + "role": { + "type": "string" + }, + "threadId": { + "type": "string" + } + }, + "required": [ + "delta", + "role", + "threadId" + ], + "type": "object" + }, + "ThreadRealtimeTranscriptDoneNotification": { + "description": "EXPERIMENTAL - final transcript text emitted when realtime completes a transcript part.", "properties": { "role": { "type": "string" }, "text": { + "description": "Final complete text for the transcript part.", "type": "string" }, "threadId": { @@ -4949,20 +4971,40 @@ "properties": { "method": { "enum": [ - "thread/realtime/transcriptUpdated" + "thread/realtime/transcript/delta" + ], + "title": "Thread/realtime/transcript/deltaNotificationMethod", + "type": "string" + }, + "params": { + "$ref": "#/definitions/ThreadRealtimeTranscriptDeltaNotification" + } + }, + "required": [ + "method", + "params" + ], 
+ "title": "Thread/realtime/transcript/deltaNotification", + "type": "object" + }, + { + "properties": { + "method": { + "enum": [ + "thread/realtime/transcript/done" ], - "title": "Thread/realtime/transcriptUpdatedNotificationMethod", + "title": "Thread/realtime/transcript/doneNotificationMethod", "type": "string" }, "params": { - "$ref": "#/definitions/ThreadRealtimeTranscriptUpdatedNotification" + "$ref": "#/definitions/ThreadRealtimeTranscriptDoneNotification" } }, "required": [ "method", "params" ], - "title": "Thread/realtime/transcriptUpdatedNotification", + "title": "Thread/realtime/transcript/doneNotification", "type": "object" }, { diff --git a/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.schemas.json b/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.schemas.json index 45038abf088..00b922c999d 100644 --- a/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.schemas.json +++ b/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.schemas.json @@ -4355,20 +4355,40 @@ "properties": { "method": { "enum": [ - "thread/realtime/transcriptUpdated" + "thread/realtime/transcript/delta" ], - "title": "Thread/realtime/transcriptUpdatedNotificationMethod", + "title": "Thread/realtime/transcript/deltaNotificationMethod", "type": "string" }, "params": { - "$ref": "#/definitions/v2/ThreadRealtimeTranscriptUpdatedNotification" + "$ref": "#/definitions/v2/ThreadRealtimeTranscriptDeltaNotification" } }, "required": [ "method", "params" ], - "title": "Thread/realtime/transcriptUpdatedNotification", + "title": "Thread/realtime/transcript/deltaNotification", + "type": "object" + }, + { + "properties": { + "method": { + "enum": [ + "thread/realtime/transcript/done" + ], + "title": "Thread/realtime/transcript/doneNotificationMethod", + "type": "string" + }, + "params": { + "$ref": "#/definitions/v2/ThreadRealtimeTranscriptDoneNotification" + } + }, + "required": [ + "method", + "params" + ], + "title": 
"Thread/realtime/transcript/doneNotification", "type": "object" }, { @@ -10640,6 +10660,13 @@ ], "type": "string" }, + "RealtimeOutputModality": { + "enum": [ + "text", + "audio" + ], + "type": "string" + }, "RealtimeVoice": { "enum": [ "alloy", @@ -14010,14 +14037,38 @@ "title": "ThreadRealtimeStartedNotification", "type": "object" }, - "ThreadRealtimeTranscriptUpdatedNotification": { + "ThreadRealtimeTranscriptDeltaNotification": { "$schema": "http://json-schema.org/draft-07/schema#", "description": "EXPERIMENTAL - flat transcript delta emitted whenever realtime transcript text changes.", + "properties": { + "delta": { + "description": "Live transcript delta from the realtime event.", + "type": "string" + }, + "role": { + "type": "string" + }, + "threadId": { + "type": "string" + } + }, + "required": [ + "delta", + "role", + "threadId" + ], + "title": "ThreadRealtimeTranscriptDeltaNotification", + "type": "object" + }, + "ThreadRealtimeTranscriptDoneNotification": { + "$schema": "http://json-schema.org/draft-07/schema#", + "description": "EXPERIMENTAL - final transcript text emitted when realtime completes a transcript part.", "properties": { "role": { "type": "string" }, "text": { + "description": "Final complete text for the transcript part.", "type": "string" }, "threadId": { @@ -14029,7 +14080,7 @@ "text", "threadId" ], - "title": "ThreadRealtimeTranscriptUpdatedNotification", + "title": "ThreadRealtimeTranscriptDoneNotification", "type": "object" }, "ThreadResumeParams": { diff --git a/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.v2.schemas.json b/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.v2.schemas.json index a2f40d9827a..0785a738649 100644 --- a/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.v2.schemas.json +++ b/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.v2.schemas.json @@ -7436,6 +7436,13 @@ ], "type": "string" }, + "RealtimeOutputModality": { + "enum": [ + "text", 
+ "audio" + ], + "type": "string" + }, "RealtimeVoice": { "enum": [ "alloy", @@ -9605,20 +9612,40 @@ "properties": { "method": { "enum": [ - "thread/realtime/transcriptUpdated" + "thread/realtime/transcript/delta" + ], + "title": "Thread/realtime/transcript/deltaNotificationMethod", + "type": "string" + }, + "params": { + "$ref": "#/definitions/ThreadRealtimeTranscriptDeltaNotification" + } + }, + "required": [ + "method", + "params" + ], + "title": "Thread/realtime/transcript/deltaNotification", + "type": "object" + }, + { + "properties": { + "method": { + "enum": [ + "thread/realtime/transcript/done" ], - "title": "Thread/realtime/transcriptUpdatedNotificationMethod", + "title": "Thread/realtime/transcript/doneNotificationMethod", "type": "string" }, "params": { - "$ref": "#/definitions/ThreadRealtimeTranscriptUpdatedNotification" + "$ref": "#/definitions/ThreadRealtimeTranscriptDoneNotification" } }, "required": [ "method", "params" ], - "title": "Thread/realtime/transcriptUpdatedNotification", + "title": "Thread/realtime/transcript/doneNotification", "type": "object" }, { @@ -11858,14 +11885,38 @@ "title": "ThreadRealtimeStartedNotification", "type": "object" }, - "ThreadRealtimeTranscriptUpdatedNotification": { + "ThreadRealtimeTranscriptDeltaNotification": { "$schema": "http://json-schema.org/draft-07/schema#", "description": "EXPERIMENTAL - flat transcript delta emitted whenever realtime transcript text changes.", + "properties": { + "delta": { + "description": "Live transcript delta from the realtime event.", + "type": "string" + }, + "role": { + "type": "string" + }, + "threadId": { + "type": "string" + } + }, + "required": [ + "delta", + "role", + "threadId" + ], + "title": "ThreadRealtimeTranscriptDeltaNotification", + "type": "object" + }, + "ThreadRealtimeTranscriptDoneNotification": { + "$schema": "http://json-schema.org/draft-07/schema#", + "description": "EXPERIMENTAL - final transcript text emitted when realtime completes a transcript part.", 
"properties": { "role": { "type": "string" }, "text": { + "description": "Final complete text for the transcript part.", "type": "string" }, "threadId": { @@ -11877,7 +11928,7 @@ "text", "threadId" ], - "title": "ThreadRealtimeTranscriptUpdatedNotification", + "title": "ThreadRealtimeTranscriptDoneNotification", "type": "object" }, "ThreadResumeParams": { diff --git a/codex-rs/app-server-protocol/schema/json/v2/ThreadRealtimeTranscriptUpdatedNotification.json b/codex-rs/app-server-protocol/schema/json/v2/ThreadRealtimeTranscriptDeltaNotification.json similarity index 70% rename from codex-rs/app-server-protocol/schema/json/v2/ThreadRealtimeTranscriptUpdatedNotification.json rename to codex-rs/app-server-protocol/schema/json/v2/ThreadRealtimeTranscriptDeltaNotification.json index 2c6860fa312..22ad778eb2a 100644 --- a/codex-rs/app-server-protocol/schema/json/v2/ThreadRealtimeTranscriptUpdatedNotification.json +++ b/codex-rs/app-server-protocol/schema/json/v2/ThreadRealtimeTranscriptDeltaNotification.json @@ -2,10 +2,11 @@ "$schema": "http://json-schema.org/draft-07/schema#", "description": "EXPERIMENTAL - flat transcript delta emitted whenever realtime transcript text changes.", "properties": { - "role": { + "delta": { + "description": "Live transcript delta from the realtime event.", "type": "string" }, - "text": { + "role": { "type": "string" }, "threadId": { @@ -13,10 +14,10 @@ } }, "required": [ + "delta", "role", - "text", "threadId" ], - "title": "ThreadRealtimeTranscriptUpdatedNotification", + "title": "ThreadRealtimeTranscriptDeltaNotification", "type": "object" } \ No newline at end of file diff --git a/codex-rs/app-server-protocol/schema/json/v2/ThreadRealtimeTranscriptDoneNotification.json b/codex-rs/app-server-protocol/schema/json/v2/ThreadRealtimeTranscriptDoneNotification.json new file mode 100644 index 00000000000..2f4199fdb9e --- /dev/null +++ b/codex-rs/app-server-protocol/schema/json/v2/ThreadRealtimeTranscriptDoneNotification.json @@ -0,0 +1,23 @@ 
+{ + "$schema": "http://json-schema.org/draft-07/schema#", + "description": "EXPERIMENTAL - final transcript text emitted when realtime completes a transcript part.", + "properties": { + "role": { + "type": "string" + }, + "text": { + "description": "Final complete text for the transcript part.", + "type": "string" + }, + "threadId": { + "type": "string" + } + }, + "required": [ + "role", + "text", + "threadId" + ], + "title": "ThreadRealtimeTranscriptDoneNotification", + "type": "object" +} \ No newline at end of file diff --git a/codex-rs/app-server-protocol/schema/typescript/RealtimeOutputModality.ts b/codex-rs/app-server-protocol/schema/typescript/RealtimeOutputModality.ts new file mode 100644 index 00000000000..78e00e7143d --- /dev/null +++ b/codex-rs/app-server-protocol/schema/typescript/RealtimeOutputModality.ts @@ -0,0 +1,5 @@ +// GENERATED CODE! DO NOT MODIFY BY HAND! + +// This file was generated by [ts-rs](https://github.com/Aleph-Alpha/ts-rs). Do not edit this file manually. 
+ +export type RealtimeOutputModality = "text" | "audio"; diff --git a/codex-rs/app-server-protocol/schema/typescript/ServerNotification.ts b/codex-rs/app-server-protocol/schema/typescript/ServerNotification.ts index a9859141342..1db7027febf 100644 --- a/codex-rs/app-server-protocol/schema/typescript/ServerNotification.ts +++ b/codex-rs/app-server-protocol/schema/typescript/ServerNotification.ts @@ -43,7 +43,8 @@ import type { ThreadRealtimeItemAddedNotification } from "./v2/ThreadRealtimeIte import type { ThreadRealtimeOutputAudioDeltaNotification } from "./v2/ThreadRealtimeOutputAudioDeltaNotification"; import type { ThreadRealtimeSdpNotification } from "./v2/ThreadRealtimeSdpNotification"; import type { ThreadRealtimeStartedNotification } from "./v2/ThreadRealtimeStartedNotification"; -import type { ThreadRealtimeTranscriptUpdatedNotification } from "./v2/ThreadRealtimeTranscriptUpdatedNotification"; +import type { ThreadRealtimeTranscriptDeltaNotification } from "./v2/ThreadRealtimeTranscriptDeltaNotification"; +import type { ThreadRealtimeTranscriptDoneNotification } from "./v2/ThreadRealtimeTranscriptDoneNotification"; import type { ThreadStartedNotification } from "./v2/ThreadStartedNotification"; import type { ThreadStatusChangedNotification } from "./v2/ThreadStatusChangedNotification"; import type { ThreadTokenUsageUpdatedNotification } from "./v2/ThreadTokenUsageUpdatedNotification"; @@ -58,4 +59,4 @@ import type { WindowsWorldWritableWarningNotification } from "./v2/WindowsWorldW /** * Notification sent from the server to the client. 
*/ -export type ServerNotification = { "method": "error", "params": ErrorNotification } | { "method": "thread/started", "params": ThreadStartedNotification } | { "method": "thread/status/changed", "params": ThreadStatusChangedNotification } | { "method": "thread/archived", "params": ThreadArchivedNotification } | { "method": "thread/unarchived", "params": ThreadUnarchivedNotification } | { "method": "thread/closed", "params": ThreadClosedNotification } | { "method": "skills/changed", "params": SkillsChangedNotification } | { "method": "thread/name/updated", "params": ThreadNameUpdatedNotification } | { "method": "thread/tokenUsage/updated", "params": ThreadTokenUsageUpdatedNotification } | { "method": "turn/started", "params": TurnStartedNotification } | { "method": "hook/started", "params": HookStartedNotification } | { "method": "turn/completed", "params": TurnCompletedNotification } | { "method": "hook/completed", "params": HookCompletedNotification } | { "method": "turn/diff/updated", "params": TurnDiffUpdatedNotification } | { "method": "turn/plan/updated", "params": TurnPlanUpdatedNotification } | { "method": "item/started", "params": ItemStartedNotification } | { "method": "item/autoApprovalReview/started", "params": ItemGuardianApprovalReviewStartedNotification } | { "method": "item/autoApprovalReview/completed", "params": ItemGuardianApprovalReviewCompletedNotification } | { "method": "item/completed", "params": ItemCompletedNotification } | { "method": "rawResponseItem/completed", "params": RawResponseItemCompletedNotification } | { "method": "item/agentMessage/delta", "params": AgentMessageDeltaNotification } | { "method": "item/plan/delta", "params": PlanDeltaNotification } | { "method": "command/exec/outputDelta", "params": CommandExecOutputDeltaNotification } | { "method": "item/commandExecution/outputDelta", "params": CommandExecutionOutputDeltaNotification } | { "method": "item/commandExecution/terminalInteraction", "params": 
TerminalInteractionNotification } | { "method": "item/fileChange/outputDelta", "params": FileChangeOutputDeltaNotification } | { "method": "serverRequest/resolved", "params": ServerRequestResolvedNotification } | { "method": "item/mcpToolCall/progress", "params": McpToolCallProgressNotification } | { "method": "mcpServer/oauthLogin/completed", "params": McpServerOauthLoginCompletedNotification } | { "method": "mcpServer/startupStatus/updated", "params": McpServerStatusUpdatedNotification } | { "method": "account/updated", "params": AccountUpdatedNotification } | { "method": "account/rateLimits/updated", "params": AccountRateLimitsUpdatedNotification } | { "method": "app/list/updated", "params": AppListUpdatedNotification } | { "method": "fs/changed", "params": FsChangedNotification } | { "method": "item/reasoning/summaryTextDelta", "params": ReasoningSummaryTextDeltaNotification } | { "method": "item/reasoning/summaryPartAdded", "params": ReasoningSummaryPartAddedNotification } | { "method": "item/reasoning/textDelta", "params": ReasoningTextDeltaNotification } | { "method": "thread/compacted", "params": ContextCompactedNotification } | { "method": "model/rerouted", "params": ModelReroutedNotification } | { "method": "deprecationNotice", "params": DeprecationNoticeNotification } | { "method": "configWarning", "params": ConfigWarningNotification } | { "method": "fuzzyFileSearch/sessionUpdated", "params": FuzzyFileSearchSessionUpdatedNotification } | { "method": "fuzzyFileSearch/sessionCompleted", "params": FuzzyFileSearchSessionCompletedNotification } | { "method": "thread/realtime/started", "params": ThreadRealtimeStartedNotification } | { "method": "thread/realtime/itemAdded", "params": ThreadRealtimeItemAddedNotification } | { "method": "thread/realtime/transcriptUpdated", "params": ThreadRealtimeTranscriptUpdatedNotification } | { "method": "thread/realtime/outputAudio/delta", "params": ThreadRealtimeOutputAudioDeltaNotification } | { "method": 
"thread/realtime/sdp", "params": ThreadRealtimeSdpNotification } | { "method": "thread/realtime/error", "params": ThreadRealtimeErrorNotification } | { "method": "thread/realtime/closed", "params": ThreadRealtimeClosedNotification } | { "method": "windows/worldWritableWarning", "params": WindowsWorldWritableWarningNotification } | { "method": "windowsSandbox/setupCompleted", "params": WindowsSandboxSetupCompletedNotification } | { "method": "account/login/completed", "params": AccountLoginCompletedNotification }; +export type ServerNotification = { "method": "error", "params": ErrorNotification } | { "method": "thread/started", "params": ThreadStartedNotification } | { "method": "thread/status/changed", "params": ThreadStatusChangedNotification } | { "method": "thread/archived", "params": ThreadArchivedNotification } | { "method": "thread/unarchived", "params": ThreadUnarchivedNotification } | { "method": "thread/closed", "params": ThreadClosedNotification } | { "method": "skills/changed", "params": SkillsChangedNotification } | { "method": "thread/name/updated", "params": ThreadNameUpdatedNotification } | { "method": "thread/tokenUsage/updated", "params": ThreadTokenUsageUpdatedNotification } | { "method": "turn/started", "params": TurnStartedNotification } | { "method": "hook/started", "params": HookStartedNotification } | { "method": "turn/completed", "params": TurnCompletedNotification } | { "method": "hook/completed", "params": HookCompletedNotification } | { "method": "turn/diff/updated", "params": TurnDiffUpdatedNotification } | { "method": "turn/plan/updated", "params": TurnPlanUpdatedNotification } | { "method": "item/started", "params": ItemStartedNotification } | { "method": "item/autoApprovalReview/started", "params": ItemGuardianApprovalReviewStartedNotification } | { "method": "item/autoApprovalReview/completed", "params": ItemGuardianApprovalReviewCompletedNotification } | { "method": "item/completed", "params": ItemCompletedNotification } | { 
"method": "rawResponseItem/completed", "params": RawResponseItemCompletedNotification } | { "method": "item/agentMessage/delta", "params": AgentMessageDeltaNotification } | { "method": "item/plan/delta", "params": PlanDeltaNotification } | { "method": "command/exec/outputDelta", "params": CommandExecOutputDeltaNotification } | { "method": "item/commandExecution/outputDelta", "params": CommandExecutionOutputDeltaNotification } | { "method": "item/commandExecution/terminalInteraction", "params": TerminalInteractionNotification } | { "method": "item/fileChange/outputDelta", "params": FileChangeOutputDeltaNotification } | { "method": "serverRequest/resolved", "params": ServerRequestResolvedNotification } | { "method": "item/mcpToolCall/progress", "params": McpToolCallProgressNotification } | { "method": "mcpServer/oauthLogin/completed", "params": McpServerOauthLoginCompletedNotification } | { "method": "mcpServer/startupStatus/updated", "params": McpServerStatusUpdatedNotification } | { "method": "account/updated", "params": AccountUpdatedNotification } | { "method": "account/rateLimits/updated", "params": AccountRateLimitsUpdatedNotification } | { "method": "app/list/updated", "params": AppListUpdatedNotification } | { "method": "fs/changed", "params": FsChangedNotification } | { "method": "item/reasoning/summaryTextDelta", "params": ReasoningSummaryTextDeltaNotification } | { "method": "item/reasoning/summaryPartAdded", "params": ReasoningSummaryPartAddedNotification } | { "method": "item/reasoning/textDelta", "params": ReasoningTextDeltaNotification } | { "method": "thread/compacted", "params": ContextCompactedNotification } | { "method": "model/rerouted", "params": ModelReroutedNotification } | { "method": "deprecationNotice", "params": DeprecationNoticeNotification } | { "method": "configWarning", "params": ConfigWarningNotification } | { "method": "fuzzyFileSearch/sessionUpdated", "params": FuzzyFileSearchSessionUpdatedNotification } | { "method": 
"fuzzyFileSearch/sessionCompleted", "params": FuzzyFileSearchSessionCompletedNotification } | { "method": "thread/realtime/started", "params": ThreadRealtimeStartedNotification } | { "method": "thread/realtime/itemAdded", "params": ThreadRealtimeItemAddedNotification } | { "method": "thread/realtime/transcript/delta", "params": ThreadRealtimeTranscriptDeltaNotification } | { "method": "thread/realtime/transcript/done", "params": ThreadRealtimeTranscriptDoneNotification } | { "method": "thread/realtime/outputAudio/delta", "params": ThreadRealtimeOutputAudioDeltaNotification } | { "method": "thread/realtime/sdp", "params": ThreadRealtimeSdpNotification } | { "method": "thread/realtime/error", "params": ThreadRealtimeErrorNotification } | { "method": "thread/realtime/closed", "params": ThreadRealtimeClosedNotification } | { "method": "windows/worldWritableWarning", "params": WindowsWorldWritableWarningNotification } | { "method": "windowsSandbox/setupCompleted", "params": WindowsSandboxSetupCompletedNotification } | { "method": "account/login/completed", "params": AccountLoginCompletedNotification }; diff --git a/codex-rs/app-server-protocol/schema/typescript/index.ts b/codex-rs/app-server-protocol/schema/typescript/index.ts index 3f07f716958..7bbb417fdc9 100644 --- a/codex-rs/app-server-protocol/schema/typescript/index.ts +++ b/codex-rs/app-server-protocol/schema/typescript/index.ts @@ -49,6 +49,7 @@ export type { ParsedCommand } from "./ParsedCommand"; export type { Personality } from "./Personality"; export type { PlanType } from "./PlanType"; export type { RealtimeConversationVersion } from "./RealtimeConversationVersion"; +export type { RealtimeOutputModality } from "./RealtimeOutputModality"; export type { RealtimeVoice } from "./RealtimeVoice"; export type { RealtimeVoicesList } from "./RealtimeVoicesList"; export type { ReasoningEffort } from "./ReasoningEffort"; diff --git 
a/codex-rs/app-server-protocol/schema/typescript/v2/ThreadRealtimeTranscriptUpdatedNotification.ts b/codex-rs/app-server-protocol/schema/typescript/v2/ThreadRealtimeTranscriptDeltaNotification.ts similarity index 60% rename from codex-rs/app-server-protocol/schema/typescript/v2/ThreadRealtimeTranscriptUpdatedNotification.ts rename to codex-rs/app-server-protocol/schema/typescript/v2/ThreadRealtimeTranscriptDeltaNotification.ts index d2940029f2f..805eeddd768 100644 --- a/codex-rs/app-server-protocol/schema/typescript/v2/ThreadRealtimeTranscriptUpdatedNotification.ts +++ b/codex-rs/app-server-protocol/schema/typescript/v2/ThreadRealtimeTranscriptDeltaNotification.ts @@ -6,4 +6,8 @@ * EXPERIMENTAL - flat transcript delta emitted whenever realtime * transcript text changes. */ -export type ThreadRealtimeTranscriptUpdatedNotification = { threadId: string, role: string, text: string, }; +export type ThreadRealtimeTranscriptDeltaNotification = { threadId: string, role: string, +/** + * Live transcript delta from the realtime event. + */ +delta: string, }; diff --git a/codex-rs/app-server-protocol/schema/typescript/v2/ThreadRealtimeTranscriptDoneNotification.ts b/codex-rs/app-server-protocol/schema/typescript/v2/ThreadRealtimeTranscriptDoneNotification.ts new file mode 100644 index 00000000000..d4667ad039f --- /dev/null +++ b/codex-rs/app-server-protocol/schema/typescript/v2/ThreadRealtimeTranscriptDoneNotification.ts @@ -0,0 +1,13 @@ +// GENERATED CODE! DO NOT MODIFY BY HAND! + +// This file was generated by [ts-rs](https://github.com/Aleph-Alpha/ts-rs). Do not edit this file manually. + +/** + * EXPERIMENTAL - final transcript text emitted when realtime completes + * a transcript part. + */ +export type ThreadRealtimeTranscriptDoneNotification = { threadId: string, role: string, +/** + * Final complete text for the transcript part. 
+ */ +text: string, }; diff --git a/codex-rs/app-server-protocol/schema/typescript/v2/index.ts b/codex-rs/app-server-protocol/schema/typescript/v2/index.ts index 961592db391..1dfdee29a5a 100644 --- a/codex-rs/app-server-protocol/schema/typescript/v2/index.ts +++ b/codex-rs/app-server-protocol/schema/typescript/v2/index.ts @@ -303,7 +303,8 @@ export type { ThreadRealtimeOutputAudioDeltaNotification } from "./ThreadRealtim export type { ThreadRealtimeSdpNotification } from "./ThreadRealtimeSdpNotification"; export type { ThreadRealtimeStartTransport } from "./ThreadRealtimeStartTransport"; export type { ThreadRealtimeStartedNotification } from "./ThreadRealtimeStartedNotification"; -export type { ThreadRealtimeTranscriptUpdatedNotification } from "./ThreadRealtimeTranscriptUpdatedNotification"; +export type { ThreadRealtimeTranscriptDeltaNotification } from "./ThreadRealtimeTranscriptDeltaNotification"; +export type { ThreadRealtimeTranscriptDoneNotification } from "./ThreadRealtimeTranscriptDoneNotification"; export type { ThreadResumeParams } from "./ThreadResumeParams"; export type { ThreadResumeResponse } from "./ThreadResumeResponse"; export type { ThreadRollbackParams } from "./ThreadRollbackParams"; diff --git a/codex-rs/app-server-protocol/src/protocol/common.rs b/codex-rs/app-server-protocol/src/protocol/common.rs index 7334a964ee5..757cab84e0a 100644 --- a/codex-rs/app-server-protocol/src/protocol/common.rs +++ b/codex-rs/app-server-protocol/src/protocol/common.rs @@ -1022,8 +1022,10 @@ server_notification_definitions! 
{ ThreadRealtimeStarted => "thread/realtime/started" (v2::ThreadRealtimeStartedNotification), #[experimental("thread/realtime/itemAdded")] ThreadRealtimeItemAdded => "thread/realtime/itemAdded" (v2::ThreadRealtimeItemAddedNotification), - #[experimental("thread/realtime/transcriptUpdated")] - ThreadRealtimeTranscriptUpdated => "thread/realtime/transcriptUpdated" (v2::ThreadRealtimeTranscriptUpdatedNotification), + #[experimental("thread/realtime/transcript/delta")] + ThreadRealtimeTranscriptDelta => "thread/realtime/transcript/delta" (v2::ThreadRealtimeTranscriptDeltaNotification), + #[experimental("thread/realtime/transcript/done")] + ThreadRealtimeTranscriptDone => "thread/realtime/transcript/done" (v2::ThreadRealtimeTranscriptDoneNotification), #[experimental("thread/realtime/outputAudio/delta")] ThreadRealtimeOutputAudioDelta => "thread/realtime/outputAudio/delta" (v2::ThreadRealtimeOutputAudioDeltaNotification), #[experimental("thread/realtime/sdp")] @@ -1056,6 +1058,8 @@ mod tests { use codex_protocol::account::PlanType; use codex_protocol::parse_command::ParsedCommand; use codex_protocol::protocol::RealtimeConversationVersion; + use codex_protocol::protocol::RealtimeOutputModality; + use codex_protocol::protocol::RealtimeVoice; use codex_utils_absolute_path::AbsolutePathBuf; use pretty_assertions::assert_eq; use serde_json::json; @@ -1784,10 +1788,11 @@ mod tests { request_id: RequestId::Integer(9), params: v2::ThreadRealtimeStartParams { thread_id: "thr_123".to_string(), + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("You are on a call".to_string())), session_id: Some("sess_456".to_string()), transport: None, - voice: Some(codex_protocol::protocol::RealtimeVoice::Marin), + voice: Some(RealtimeVoice::Marin), }, }; assert_eq!( @@ -1796,6 +1801,7 @@ mod tests { "id": 9, "params": { "threadId": "thr_123", + "outputModality": "audio", "prompt": "You are on a call", "sessionId": "sess_456", "transport": null, @@ -1813,6 +1819,7 @@ mod tests { 
request_id: RequestId::Integer(9), params: v2::ThreadRealtimeStartParams { thread_id: "thr_123".to_string(), + output_modality: RealtimeOutputModality::Audio, prompt: None, session_id: None, transport: None, @@ -1825,6 +1832,7 @@ mod tests { "id": 9, "params": { "threadId": "thr_123", + "outputModality": "audio", "sessionId": null, "transport": null, "voice": null @@ -1837,6 +1845,7 @@ mod tests { request_id: RequestId::Integer(9), params: v2::ThreadRealtimeStartParams { thread_id: "thr_123".to_string(), + output_modality: RealtimeOutputModality::Audio, prompt: Some(None), session_id: None, transport: None, @@ -1849,6 +1858,7 @@ mod tests { "id": 9, "params": { "threadId": "thr_123", + "outputModality": "audio", "prompt": null, "sessionId": null, "transport": null, @@ -1863,6 +1873,7 @@ mod tests { "id": 9, "params": { "threadId": "thr_123", + "outputModality": "audio", "sessionId": null, "transport": null, "voice": null @@ -1878,6 +1889,7 @@ mod tests { "id": 9, "params": { "threadId": "thr_123", + "outputModality": "audio", "prompt": null, "sessionId": null, "transport": null, @@ -1962,6 +1974,7 @@ mod tests { request_id: RequestId::Integer(1), params: v2::ThreadRealtimeStartParams { thread_id: "thr_123".to_string(), + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("You are on a call".to_string())), session_id: None, transport: None, diff --git a/codex-rs/app-server-protocol/src/protocol/v2.rs b/codex-rs/app-server-protocol/src/protocol/v2.rs index 1e42fae12f2..81a58cefc49 100644 --- a/codex-rs/app-server-protocol/src/protocol/v2.rs +++ b/codex-rs/app-server-protocol/src/protocol/v2.rs @@ -74,6 +74,7 @@ use codex_protocol::protocol::RateLimitWindow as CoreRateLimitWindow; use codex_protocol::protocol::ReadOnlyAccess as CoreReadOnlyAccess; use codex_protocol::protocol::RealtimeAudioFrame as CoreRealtimeAudioFrame; use codex_protocol::protocol::RealtimeConversationVersion; +use codex_protocol::protocol::RealtimeOutputModality; use 
codex_protocol::protocol::RealtimeVoice; use codex_protocol::protocol::RealtimeVoicesList; use codex_protocol::protocol::ReviewDecision as CoreReviewDecision; @@ -3954,11 +3955,14 @@ impl From for CoreRealtimeAudioFrame { } /// EXPERIMENTAL - start a thread-scoped realtime session. -#[derive(Serialize, Deserialize, Debug, Default, Clone, PartialEq, JsonSchema, TS)] +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, JsonSchema, TS)] #[serde(rename_all = "camelCase")] #[ts(export_to = "v2/")] pub struct ThreadRealtimeStartParams { pub thread_id: String, + /// Selects text or audio output for the realtime session. Transport and voice stay + /// independent so clients can choose how they connect separately from what the model emits. + pub output_modality: RealtimeOutputModality, #[serde( default, deserialize_with = "super::serde_helpers::deserialize_double_option", @@ -4076,9 +4080,22 @@ pub struct ThreadRealtimeItemAddedNotification { #[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, JsonSchema, TS)] #[serde(rename_all = "camelCase")] #[ts(export_to = "v2/")] -pub struct ThreadRealtimeTranscriptUpdatedNotification { +pub struct ThreadRealtimeTranscriptDeltaNotification { + pub thread_id: String, + pub role: String, + /// Live transcript delta from the realtime event. + pub delta: String, +} + +/// EXPERIMENTAL - final transcript text emitted when realtime completes +/// a transcript part. +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, JsonSchema, TS)] +#[serde(rename_all = "camelCase")] +#[ts(export_to = "v2/")] +pub struct ThreadRealtimeTranscriptDoneNotification { pub thread_id: String, pub role: String, + /// Final complete text for the transcript part. 
pub text: String, } diff --git a/codex-rs/app-server/README.md b/codex-rs/app-server/README.md index 7083c791169..fe3c435b84c 100644 --- a/codex-rs/app-server/README.md +++ b/codex-rs/app-server/README.md @@ -154,7 +154,7 @@ Example with notification opt-out: - `thread/inject_items` — append raw Responses API items to a loaded thread’s model-visible history without starting a user turn; returns `{}` on success. - `turn/steer` — add user input to an already in-flight regular turn without starting a new turn; returns the active `turnId` that accepted the input. Review and manual compaction turns reject `turn/steer`. - `turn/interrupt` — request cancellation of an in-flight turn by `(thread_id, turn_id)`; success is an empty `{}` response and the turn finishes with `status: "interrupted"`. -- `thread/realtime/start` — start a thread-scoped realtime session (experimental); returns `{}` and streams `thread/realtime/*` notifications. Omit `transport` for the websocket transport, or pass `{ "type": "webrtc", "sdp": "..." }` to create a WebRTC session from a browser-generated SDP offer; the remote answer SDP is emitted as `thread/realtime/sdp`. +- `thread/realtime/start` — start a thread-scoped realtime session (experimental), passing `outputModality: "text"` or `outputModality: "audio"` to choose the model output; returns `{}` and streams `thread/realtime/*` notifications. Omit `transport` for the websocket transport, or pass `{ "type": "webrtc", "sdp": "..." }` to create a WebRTC session from a browser-generated SDP offer; the remote answer SDP is emitted as `thread/realtime/sdp`. - `thread/realtime/appendAudio` — append an input audio chunk to the active realtime session (experimental); returns `{}`. - `thread/realtime/appendText` — append text input to the active realtime session (experimental); returns `{}`. - `thread/realtime/stop` — stop the active realtime session for the thread (experimental); returns `{}`. @@ -627,6 +627,7 @@ Then send `offer.sdp` to app-server. 
Core uses `experimental_realtime_ws_backend ```json { "method": "thread/realtime/start", "id": 40, "params": { "threadId": "thr_123", + "outputModality": "audio", "prompt": "You are on a call.", "sessionId": null, "transport": { "type": "webrtc", "sdp": "v=0\r\no=..." } @@ -950,7 +951,8 @@ The thread realtime API emits thread-scoped notifications for session lifecycle - `thread/realtime/started` — `{ threadId, sessionId }` once realtime starts for the thread (experimental). - `thread/realtime/itemAdded` — `{ threadId, item }` for raw non-audio realtime items that do not have a dedicated typed app-server notification, including `handoff_request` (experimental). `item` is forwarded as raw JSON while the upstream websocket item schema remains unstable. -- `thread/realtime/transcriptUpdated` — `{ threadId, role, text }` whenever realtime transcript text changes (experimental). This forwards the live transcript delta from that realtime event, not the full accumulated transcript. +- `thread/realtime/transcript/delta` — `{ threadId, role, delta }` for live realtime transcript deltas (experimental). `delta` carries only the incremental text from that realtime event, not the full accumulated transcript. +- `thread/realtime/transcript/done` — `{ threadId, role, text }` when realtime emits the final full text for a transcript part (experimental). - `thread/realtime/outputAudio/delta` — `{ threadId, audio }` for streamed output audio chunks (experimental). `audio` uses camelCase fields (`data`, `sampleRate`, `numChannels`, `samplesPerChannel`). - `thread/realtime/error` — `{ threadId, message }` when realtime encounters a transport or backend error (experimental). - `thread/realtime/closed` — `{ threadId, reason }` when the realtime transport closes (experimental). 
diff --git a/codex-rs/app-server/src/bespoke_event_handling.rs b/codex-rs/app-server/src/bespoke_event_handling.rs index 8d0d40ff945..b1202bda2ca 100644 --- a/codex-rs/app-server/src/bespoke_event_handling.rs +++ b/codex-rs/app-server/src/bespoke_event_handling.rs @@ -82,7 +82,8 @@ use codex_app_server_protocol::ThreadRealtimeItemAddedNotification; use codex_app_server_protocol::ThreadRealtimeOutputAudioDeltaNotification; use codex_app_server_protocol::ThreadRealtimeSdpNotification; use codex_app_server_protocol::ThreadRealtimeStartedNotification; -use codex_app_server_protocol::ThreadRealtimeTranscriptUpdatedNotification; +use codex_app_server_protocol::ThreadRealtimeTranscriptDeltaNotification; +use codex_app_server_protocol::ThreadRealtimeTranscriptDoneNotification; use codex_app_server_protocol::ThreadRollbackResponse; use codex_app_server_protocol::ThreadTokenUsage; use codex_app_server_protocol::ThreadTokenUsageUpdatedNotification; @@ -401,26 +402,50 @@ pub(crate) async fn apply_bespoke_event_handling( .await; } RealtimeEvent::InputTranscriptDelta(event) => { - let notification = ThreadRealtimeTranscriptUpdatedNotification { + let notification = ThreadRealtimeTranscriptDeltaNotification { thread_id: conversation_id.to_string(), role: "user".to_string(), - text: event.delta, + delta: event.delta, }; outgoing .send_server_notification( - ServerNotification::ThreadRealtimeTranscriptUpdated(notification), + ServerNotification::ThreadRealtimeTranscriptDelta(notification), + ) + .await; + } + RealtimeEvent::InputTranscriptDone(event) => { + let notification = ThreadRealtimeTranscriptDoneNotification { + thread_id: conversation_id.to_string(), + role: "user".to_string(), + text: event.text, + }; + outgoing + .send_server_notification( + ServerNotification::ThreadRealtimeTranscriptDone(notification), ) .await; } RealtimeEvent::OutputTranscriptDelta(event) => { - let notification = ThreadRealtimeTranscriptUpdatedNotification { + let notification = 
ThreadRealtimeTranscriptDeltaNotification { + thread_id: conversation_id.to_string(), + role: "assistant".to_string(), + delta: event.delta, + }; + outgoing + .send_server_notification( + ServerNotification::ThreadRealtimeTranscriptDelta(notification), + ) + .await; + } + RealtimeEvent::OutputTranscriptDone(event) => { + let notification = ThreadRealtimeTranscriptDoneNotification { thread_id: conversation_id.to_string(), role: "assistant".to_string(), - text: event.delta, + text: event.text, }; outgoing .send_server_notification( - ServerNotification::ThreadRealtimeTranscriptUpdated(notification), + ServerNotification::ThreadRealtimeTranscriptDone(notification), ) .await; } diff --git a/codex-rs/app-server/src/codex_message_processor.rs b/codex-rs/app-server/src/codex_message_processor.rs index d0b835280c5..0a953464f19 100644 --- a/codex-rs/app-server/src/codex_message_processor.rs +++ b/codex-rs/app-server/src/codex_message_processor.rs @@ -7218,6 +7218,7 @@ impl CodexMessageProcessor { &request_id, thread.as_ref(), Op::RealtimeConversationStart(ConversationStartParams { + output_modality: params.output_modality, prompt: params.prompt, session_id: params.session_id, transport: params.transport.map(|transport| match transport { diff --git a/codex-rs/app-server/tests/suite/v2/experimental_api.rs b/codex-rs/app-server/tests/suite/v2/experimental_api.rs index 25a607390ec..2fd457faf23 100644 --- a/codex-rs/app-server/tests/suite/v2/experimental_api.rs +++ b/codex-rs/app-server/tests/suite/v2/experimental_api.rs @@ -17,6 +17,7 @@ use codex_app_server_protocol::ThreadRealtimeStartParams; use codex_app_server_protocol::ThreadRealtimeStartTransport; use codex_app_server_protocol::ThreadStartParams; use codex_app_server_protocol::ThreadStartResponse; +use codex_protocol::protocol::RealtimeOutputModality; use pretty_assertions::assert_eq; use std::path::Path; use std::time::Duration; @@ -76,6 +77,7 @@ async fn 
realtime_conversation_start_requires_experimental_api_capability() -> R let request_id = mcp .send_thread_realtime_start_request(ThreadRealtimeStartParams { thread_id: "thr_123".to_string(), + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("hello".to_string())), session_id: None, transport: None, @@ -145,6 +147,7 @@ async fn realtime_webrtc_start_requires_experimental_api_capability() -> Result< let request_id = mcp .send_thread_realtime_start_request(ThreadRealtimeStartParams { thread_id: "thr_123".to_string(), + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("hello".to_string())), session_id: None, transport: Some(ThreadRealtimeStartTransport::Webrtc { diff --git a/codex-rs/app-server/tests/suite/v2/realtime_conversation.rs b/codex-rs/app-server/tests/suite/v2/realtime_conversation.rs index 59a815c144f..ab593381b22 100644 --- a/codex-rs/app-server/tests/suite/v2/realtime_conversation.rs +++ b/codex-rs/app-server/tests/suite/v2/realtime_conversation.rs @@ -31,7 +31,8 @@ use codex_app_server_protocol::ThreadRealtimeStartTransport; use codex_app_server_protocol::ThreadRealtimeStartedNotification; use codex_app_server_protocol::ThreadRealtimeStopParams; use codex_app_server_protocol::ThreadRealtimeStopResponse; -use codex_app_server_protocol::ThreadRealtimeTranscriptUpdatedNotification; +use codex_app_server_protocol::ThreadRealtimeTranscriptDeltaNotification; +use codex_app_server_protocol::ThreadRealtimeTranscriptDoneNotification; use codex_app_server_protocol::ThreadStartParams; use codex_app_server_protocol::ThreadStartResponse; use codex_app_server_protocol::TurnCompletedNotification; @@ -39,6 +40,7 @@ use codex_app_server_protocol::TurnStartedNotification; use codex_features::FEATURES; use codex_features::Feature; use codex_protocol::protocol::RealtimeConversationVersion; +use codex_protocol::protocol::RealtimeOutputModality; use codex_protocol::protocol::RealtimeVoice; use codex_protocol::protocol::RealtimeVoicesList; use 
core_test_support::responses; @@ -301,6 +303,7 @@ impl RealtimeE2eHarness { .mcp .send_thread_realtime_start_request(ThreadRealtimeStartParams { thread_id: self.thread_id.clone(), + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: Some(ThreadRealtimeStartTransport::Webrtc { @@ -478,6 +481,15 @@ async fn realtime_conversation_streams_v2_notifications() -> Result<()> { "type": "response.output_text.delta", "delta": "working" }), + json!({ + "type": "conversation.item.done", + "item": { + "id": "item_assistant_1", + "type": "message", + "role": "assistant", + "content": [{ "type": "output_text", "text": "working on it" }] + } + }), json!({ "type": "conversation.item.done", "item": { @@ -523,6 +535,7 @@ async fn realtime_conversation_streams_v2_notifications() -> Result<()> { let start_request_id = mcp .send_thread_realtime_start_request(ThreadRealtimeStartParams { thread_id: thread_start.thread.id.clone(), + output_modality: RealtimeOutputModality::Audio, prompt: None, session_id: None, transport: None, @@ -554,6 +567,10 @@ async fn realtime_conversation_streams_v2_notifications() -> Result<()> { startup_context_request.body_json()["session"]["audio"]["output"]["voice"], "cedar" ); + assert_eq!( + startup_context_request.body_json()["session"]["output_modalities"], + json!(["audio"]) + ); let startup_context_instructions = startup_context_request.body_json()["session"]["instructions"] .as_str() @@ -612,24 +629,32 @@ async fn realtime_conversation_streams_v2_notifications() -> Result<()> { assert_eq!(item_added.thread_id, output_audio.thread_id); assert_eq!(item_added.item["type"], json!("message")); - let first_transcript_update = read_notification::( + let first_transcript_delta = read_notification::( &mut mcp, - "thread/realtime/transcriptUpdated", + "thread/realtime/transcript/delta", ) .await?; - assert_eq!(first_transcript_update.thread_id, output_audio.thread_id); - 
assert_eq!(first_transcript_update.role, "user"); - assert_eq!(first_transcript_update.text, "delegate now"); - - let second_transcript_update = - read_notification::( - &mut mcp, - "thread/realtime/transcriptUpdated", - ) - .await?; - assert_eq!(second_transcript_update.thread_id, output_audio.thread_id); - assert_eq!(second_transcript_update.role, "assistant"); - assert_eq!(second_transcript_update.text, "working"); + assert_eq!(first_transcript_delta.thread_id, output_audio.thread_id); + assert_eq!(first_transcript_delta.role, "user"); + assert_eq!(first_transcript_delta.delta, "delegate now"); + + let second_transcript_delta = read_notification::( + &mut mcp, + "thread/realtime/transcript/delta", + ) + .await?; + assert_eq!(second_transcript_delta.thread_id, output_audio.thread_id); + assert_eq!(second_transcript_delta.role, "assistant"); + assert_eq!(second_transcript_delta.delta, "working"); + + let final_transcript_done = read_notification::( + &mut mcp, + "thread/realtime/transcript/done", + ) + .await?; + assert_eq!(final_transcript_done.thread_id, output_audio.thread_id); + assert_eq!(final_transcript_done.role, "assistant"); + assert_eq!(final_transcript_done.text, "working on it"); let handoff_item_added = read_notification::( &mut mcp, @@ -693,6 +718,140 @@ async fn realtime_conversation_streams_v2_notifications() -> Result<()> { Ok(()) } +#[tokio::test] +async fn realtime_text_output_modality_requests_text_output_and_final_transcript() -> Result<()> { + skip_if_no_network!(Ok(())); + + let responses_server = create_mock_responses_server_sequence_unchecked(Vec::new()).await; + let realtime_server = start_websocket_server(vec![vec![vec![ + json!({ + "type": "session.updated", + "session": { "id": "sess_text", "instructions": "backend prompt" } + }), + json!({ + "type": "response.output_text.delta", + "delta": "hello " + }), + json!({ + "type": "response.output_text.delta", + "delta": "world" + }), + json!({ + "type": 
"response.output_audio_transcript.done", + "transcript": "hello world" + }), + json!({ + "type": "conversation.item.done", + "item": { + "id": "item_output_1", + "type": "message", + "role": "assistant", + "content": [{"type": "output_text", "text": "hello world"}] + } + }), + ]]]) + .await; + + let codex_home = TempDir::new()?; + create_config_toml( + codex_home.path(), + &responses_server.uri(), + realtime_server.uri(), + /*realtime_enabled*/ true, + StartupContextConfig::Generated, + )?; + + let mut mcp = McpProcess::new(codex_home.path()).await?; + mcp.initialize().await?; + login_with_api_key(&mut mcp, "sk-test-key").await?; + + let thread_start_request_id = mcp + .send_thread_start_request(ThreadStartParams::default()) + .await?; + let thread_start_response: JSONRPCResponse = timeout( + DEFAULT_TIMEOUT, + mcp.read_stream_until_response_message(RequestId::Integer(thread_start_request_id)), + ) + .await??; + let thread_start: ThreadStartResponse = to_response(thread_start_response)?; + + let start_request_id = mcp + .send_thread_realtime_start_request(ThreadRealtimeStartParams { + thread_id: thread_start.thread.id.clone(), + output_modality: RealtimeOutputModality::Text, + prompt: None, + session_id: None, + transport: None, + voice: None, + }) + .await?; + let start_response: JSONRPCResponse = timeout( + DEFAULT_TIMEOUT, + mcp.read_stream_until_response_message(RequestId::Integer(start_request_id)), + ) + .await??; + let _: ThreadRealtimeStartResponse = to_response(start_response)?; + + let session_update = realtime_server + .wait_for_request(/*connection_index*/ 0, /*request_index*/ 0) + .await; + assert_eq!( + session_update.body_json()["session"]["output_modalities"], + json!(["text"]) + ); + + let first_delta = read_notification::( + &mut mcp, + "thread/realtime/transcript/delta", + ) + .await?; + let second_delta = read_notification::( + &mut mcp, + "thread/realtime/transcript/delta", + ) + .await?; + let done = read_notification::( + &mut mcp, + 
"thread/realtime/transcript/done", + ) + .await?; + assert_eq!( + vec![first_delta, second_delta], + vec![ + ThreadRealtimeTranscriptDeltaNotification { + thread_id: thread_start.thread.id.clone(), + role: "assistant".to_string(), + delta: "hello ".to_string(), + }, + ThreadRealtimeTranscriptDeltaNotification { + thread_id: thread_start.thread.id.clone(), + role: "assistant".to_string(), + delta: "world".to_string(), + }, + ] + ); + assert_eq!( + done, + ThreadRealtimeTranscriptDoneNotification { + thread_id: thread_start.thread.id, + role: "assistant".to_string(), + text: "hello world".to_string(), + } + ); + assert!( + timeout( + Duration::from_millis(200), + mcp.read_stream_until_notification_message("thread/realtime/transcript/done"), + ) + .await + .is_err(), + "should not emit duplicate transcript done from audio transcript done" + ); + + realtime_server.shutdown().await; + Ok(()) +} + #[tokio::test] async fn realtime_list_voices_returns_supported_names() -> Result<()> { let codex_home = TempDir::new()?; @@ -793,6 +952,7 @@ async fn realtime_conversation_stop_emits_closed_notification() -> Result<()> { let start_request_id = mcp .send_thread_realtime_start_request(ThreadRealtimeStartParams { thread_id: thread_start.thread.id.clone(), + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, @@ -889,6 +1049,7 @@ async fn realtime_webrtc_start_emits_sdp_notification() -> Result<()> { let start_request_id = mcp .send_thread_realtime_start_request(ThreadRealtimeStartParams { thread_id: thread_id.clone(), + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: Some(ThreadRealtimeStartTransport::Webrtc { @@ -1163,11 +1324,11 @@ async fn webrtc_v2_forwards_audio_and_text_between_client_and_sideband() -> Resu harness.append_text(thread_id, "hello").await?; let transcript = harness - .read_notification::( - 
"thread/realtime/transcriptUpdated", + .read_notification::( + "thread/realtime/transcript/delta", ) .await?; - assert_eq!(transcript.text, "transcribed audio"); + assert_eq!(transcript.delta, "transcribed audio"); let output_audio = harness .read_notification::( "thread/realtime/outputAudio/delta", @@ -1252,11 +1413,11 @@ async fn webrtc_v2_text_input_is_append_only_while_response_is_active() -> Resul "first", ); let transcript = harness - .read_notification::( - "thread/realtime/transcriptUpdated", + .read_notification::( + "thread/realtime/transcript/delta", ) .await?; - assert_eq!(transcript.text, "active response started"); + assert_eq!(transcript.delta, "active response started"); // Phase 3: send a second text turn while `resp_active` is still open. The // user message must reach realtime without requesting another response. @@ -1736,6 +1897,7 @@ async fn realtime_webrtc_start_surfaces_backend_error() -> Result<()> { let start_request_id = mcp .send_thread_realtime_start_request(ThreadRealtimeStartParams { thread_id: thread_start.thread.id, + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: Some(ThreadRealtimeStartTransport::Webrtc { @@ -1794,6 +1956,7 @@ async fn realtime_conversation_requires_feature_flag() -> Result<()> { let start_request_id = mcp .send_thread_realtime_start_request(ThreadRealtimeStartParams { thread_id: thread_start.thread.id.clone(), + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, diff --git a/codex-rs/codex-api/src/endpoint/mod.rs b/codex-rs/codex-api/src/endpoint/mod.rs index 4a208317a9d..c16687ff281 100644 --- a/codex-rs/codex-api/src/endpoint/mod.rs +++ b/codex-rs/codex-api/src/endpoint/mod.rs @@ -13,6 +13,7 @@ pub use models::ModelsClient; pub use realtime_call::RealtimeCallClient; pub use realtime_call::RealtimeCallResponse; pub use 
realtime_websocket::RealtimeEventParser; +pub use realtime_websocket::RealtimeOutputModality; pub use realtime_websocket::RealtimeSessionConfig; pub use realtime_websocket::RealtimeSessionMode; pub use realtime_websocket::RealtimeWebsocketClient; diff --git a/codex-rs/codex-api/src/endpoint/realtime_call.rs b/codex-rs/codex-api/src/endpoint/realtime_call.rs index 8a68d088c7c..fbcdbf519ee 100644 --- a/codex-rs/codex-api/src/endpoint/realtime_call.rs +++ b/codex-rs/codex-api/src/endpoint/realtime_call.rs @@ -219,6 +219,7 @@ fn decode_call_id_from_location(headers: &HeaderMap) -> Result mod tests { use super::*; use crate::endpoint::realtime_websocket::RealtimeEventParser; + use crate::endpoint::realtime_websocket::RealtimeOutputModality; use crate::endpoint::realtime_websocket::RealtimeSessionMode; use crate::provider::RetryConfig; use async_trait::async_trait; @@ -309,6 +310,7 @@ mod tests { session_id: Some(session_id.to_string()), event_parser: RealtimeEventParser::RealtimeV2, session_mode: RealtimeSessionMode::Conversational, + output_modality: RealtimeOutputModality::Audio, voice: RealtimeVoice::Marin, } } diff --git a/codex-rs/codex-api/src/endpoint/realtime_websocket/methods.rs b/codex-rs/codex-api/src/endpoint/realtime_websocket/methods.rs index a2681f4969b..b1fd1ef75cc 100644 --- a/codex-rs/codex-api/src/endpoint/realtime_websocket/methods.rs +++ b/codex-rs/codex-api/src/endpoint/realtime_websocket/methods.rs @@ -7,9 +7,9 @@ use crate::endpoint::realtime_websocket::protocol::RealtimeAudioFrame; use crate::endpoint::realtime_websocket::protocol::RealtimeEvent; use crate::endpoint::realtime_websocket::protocol::RealtimeEventParser; use crate::endpoint::realtime_websocket::protocol::RealtimeOutboundMessage; +use crate::endpoint::realtime_websocket::protocol::RealtimeOutputModality; use crate::endpoint::realtime_websocket::protocol::RealtimeSessionConfig; use crate::endpoint::realtime_websocket::protocol::RealtimeSessionMode; -use 
crate::endpoint::realtime_websocket::protocol::RealtimeTranscriptDelta; use crate::endpoint::realtime_websocket::protocol::RealtimeTranscriptEntry; use crate::endpoint::realtime_websocket::protocol::RealtimeVoice; use crate::endpoint::realtime_websocket::protocol::parse_realtime_event; @@ -17,6 +17,7 @@ use crate::error::ApiError; use crate::provider::Provider; use codex_client::backoff; use codex_client::maybe_build_rustls_client_config_with_custom_ca; +use codex_protocol::protocol::RealtimeTranscriptDelta; use codex_utils_rustls_provider::ensure_rustls_crypto_provider; use futures::SinkExt; use futures::StreamExt; @@ -307,10 +308,17 @@ impl RealtimeWebsocketWriter { &self, instructions: String, session_mode: RealtimeSessionMode, + output_modality: RealtimeOutputModality, voice: RealtimeVoice, ) -> Result<(), ApiError> { let session_mode = normalized_session_mode(self.event_parser, session_mode); - let session = session_update_session(self.event_parser, instructions, session_mode, voice); + let session = session_update_session( + self.event_parser, + instructions, + session_mode, + output_modality, + voice, + ); self.send_json(&RealtimeOutboundMessage::SessionUpdate { session }) .await } @@ -406,10 +414,10 @@ impl RealtimeWebsocketEvents { let mut active_transcript = self.active_transcript.lock().await; match event { RealtimeEvent::InputAudioSpeechStarted(_) => {} - RealtimeEvent::InputTranscriptDelta(RealtimeTranscriptDelta { delta }) => { + RealtimeEvent::InputTranscriptDelta(RealtimeTranscriptDelta { delta, .. }) => { append_transcript_delta(&mut active_transcript.entries, "user", delta); } - RealtimeEvent::OutputTranscriptDelta(RealtimeTranscriptDelta { delta }) => { + RealtimeEvent::OutputTranscriptDelta(RealtimeTranscriptDelta { delta, .. }) => { append_transcript_delta(&mut active_transcript.entries, "assistant", delta); } RealtimeEvent::HandoffRequested(handoff) => { @@ -418,6 +426,8 @@ impl RealtimeWebsocketEvents { } } RealtimeEvent::SessionUpdated { .. 
} + | RealtimeEvent::InputTranscriptDone(_) + | RealtimeEvent::OutputTranscriptDone(_) | RealtimeEvent::AudioOut(_) | RealtimeEvent::ResponseCreated(_) | RealtimeEvent::ResponseCancelled(_) @@ -581,7 +591,12 @@ impl RealtimeWebsocketClient { ); connection .writer - .send_session_update(config.instructions, config.session_mode, config.voice) + .send_session_update( + config.instructions, + config.session_mode, + config.output_modality, + config.voice, + ) .await?; Ok(connection) } @@ -721,13 +736,14 @@ fn normalize_realtime_path(url: &mut Url) { #[cfg(test)] mod tests { use super::*; - use crate::endpoint::realtime_websocket::protocol::RealtimeTranscriptDelta; use crate::endpoint::realtime_websocket::protocol::RealtimeTranscriptEntry; use codex_protocol::protocol::RealtimeHandoffRequested; use codex_protocol::protocol::RealtimeInputAudioSpeechStarted; use codex_protocol::protocol::RealtimeResponseCancelled; use codex_protocol::protocol::RealtimeResponseCreated; use codex_protocol::protocol::RealtimeResponseDone; + use codex_protocol::protocol::RealtimeTranscriptDelta; + use codex_protocol::protocol::RealtimeTranscriptDone; use codex_protocol::protocol::RealtimeVoice; use http::HeaderValue; use pretty_assertions::assert_eq; @@ -894,6 +910,8 @@ mod tests { fn parse_realtime_v2_input_audio_transcription_delta_event() { let payload = json!({ "type": "conversation.item.input_audio_transcription.delta", + "item_id": "item_input_1", + "content_index": 0, "delta": "hello" }) .to_string(); @@ -908,6 +926,32 @@ mod tests { ); } + #[test] + fn parse_realtime_v2_item_done_output_text_event() { + let payload = json!({ + "type": "conversation.item.done", + "item": { + "id": "item_output_1", + "type": "message", + "role": "assistant", + "content": [ + {"type": "output_text", "text": "hello"}, + {"type": "output_text", "text": " world"} + ] + } + }) + .to_string(); + + assert_eq!( + parse_realtime_event(payload.as_str(), RealtimeEventParser::RealtimeV2), + 
Some(RealtimeEvent::OutputTranscriptDone( + RealtimeTranscriptDone { + text: "hello world".to_string(), + } + )) + ); + } + #[test] fn parse_realtime_v2_output_audio_delta_defaults_audio_shape() { let payload = json!({ @@ -1374,6 +1418,7 @@ mod tests { session_id: Some("conv_1".to_string()), event_parser: RealtimeEventParser::V1, session_mode: RealtimeSessionMode::Conversational, + output_modality: RealtimeOutputModality::Audio, voice: RealtimeVoice::Breeze, }, HeaderMap::new(), @@ -1648,6 +1693,7 @@ mod tests { session_id: Some("conv_1".to_string()), event_parser: RealtimeEventParser::RealtimeV2, session_mode: RealtimeSessionMode::Conversational, + output_modality: RealtimeOutputModality::Audio, voice: RealtimeVoice::Cedar, }, HeaderMap::new(), @@ -1753,6 +1799,7 @@ mod tests { session_id: Some("conv_1".to_string()), event_parser: RealtimeEventParser::RealtimeV2, session_mode: RealtimeSessionMode::Transcription, + output_modality: RealtimeOutputModality::Audio, voice: RealtimeVoice::Marin, }, HeaderMap::new(), @@ -1856,6 +1903,7 @@ mod tests { session_id: Some("conv_1".to_string()), event_parser: RealtimeEventParser::V1, session_mode: RealtimeSessionMode::Transcription, + output_modality: RealtimeOutputModality::Audio, voice: RealtimeVoice::Cove, }, HeaderMap::new(), @@ -1945,6 +1993,7 @@ mod tests { session_id: Some("conv_1".to_string()), event_parser: RealtimeEventParser::V1, session_mode: RealtimeSessionMode::Conversational, + output_modality: RealtimeOutputModality::Audio, voice: RealtimeVoice::Cove, }, HeaderMap::new(), diff --git a/codex-rs/codex-api/src/endpoint/realtime_websocket/methods_common.rs b/codex-rs/codex-api/src/endpoint/realtime_websocket/methods_common.rs index 8eb079fe83e..67345bdc7ea 100644 --- a/codex-rs/codex-api/src/endpoint/realtime_websocket/methods_common.rs +++ b/codex-rs/codex-api/src/endpoint/realtime_websocket/methods_common.rs @@ -8,6 +8,7 @@ use crate::endpoint::realtime_websocket::methods_v2::session_update_session as v use 
crate::endpoint::realtime_websocket::methods_v2::websocket_intent as v2_websocket_intent; use crate::endpoint::realtime_websocket::protocol::RealtimeEventParser; use crate::endpoint::realtime_websocket::protocol::RealtimeOutboundMessage; +use crate::endpoint::realtime_websocket::protocol::RealtimeOutputModality; use crate::endpoint::realtime_websocket::protocol::RealtimeSessionConfig; use crate::endpoint::realtime_websocket::protocol::RealtimeSessionMode; use crate::endpoint::realtime_websocket::protocol::RealtimeVoice; @@ -57,13 +58,14 @@ pub(super) fn session_update_session( event_parser: RealtimeEventParser, instructions: String, session_mode: RealtimeSessionMode, + output_modality: RealtimeOutputModality, voice: RealtimeVoice, ) -> SessionUpdateSession { let session_mode = normalized_session_mode(event_parser, session_mode); match event_parser { RealtimeEventParser::V1 => v1_session_update_session(instructions, voice), RealtimeEventParser::RealtimeV2 => { - v2_session_update_session(instructions, session_mode, voice) + v2_session_update_session(instructions, session_mode, output_modality, voice) } } } @@ -73,6 +75,7 @@ pub fn session_update_session_json(config: RealtimeSessionConfig) -> JsonResult< config.event_parser, config.instructions, config.session_mode, + config.output_modality, config.voice, ); session.id = config.session_id; diff --git a/codex-rs/codex-api/src/endpoint/realtime_websocket/methods_v2.rs b/codex-rs/codex-api/src/endpoint/realtime_websocket/methods_v2.rs index c8881a7f066..f0f81d95ebf 100644 --- a/codex-rs/codex-api/src/endpoint/realtime_websocket/methods_v2.rs +++ b/codex-rs/codex-api/src/endpoint/realtime_websocket/methods_v2.rs @@ -9,6 +9,7 @@ use crate::endpoint::realtime_websocket::protocol::ConversationMessageItem; use crate::endpoint::realtime_websocket::protocol::ConversationRole; use crate::endpoint::realtime_websocket::protocol::NoiseReductionType; use crate::endpoint::realtime_websocket::protocol::RealtimeOutboundMessage; +use 
crate::endpoint::realtime_websocket::protocol::RealtimeOutputModality; use crate::endpoint::realtime_websocket::protocol::RealtimeSessionMode; use crate::endpoint::realtime_websocket::protocol::RealtimeVoice; use crate::endpoint::realtime_websocket::protocol::SessionAudio; @@ -26,6 +27,7 @@ use crate::endpoint::realtime_websocket::protocol::TurnDetectionType; use serde_json::json; const REALTIME_V2_OUTPUT_MODALITY_AUDIO: &str = "audio"; +const REALTIME_V2_OUTPUT_MODALITY_TEXT: &str = "text"; const REALTIME_V2_TOOL_CHOICE: &str = "auto"; const REALTIME_V2_BACKGROUND_AGENT_TOOL_NAME: &str = "background_agent"; const REALTIME_V2_BACKGROUND_AGENT_TOOL_DESCRIPTION: &str = "Send a user request to the background agent. Use this as the default action. Do not rephrase the user's ask or rewrite it in your own words; pass along the user's own words. If the background agent is idle, this starts a new task and returns the final result to the user. If the background agent is already working on a task, this sends the request as guidance to steer that previous task. 
If the user asks to do something next, later, after this, or once current work finishes, call this tool so the work is actually queued instead of merely promising to do it later."; @@ -59,6 +61,7 @@ pub(super) fn conversation_handoff_append_message( pub(super) fn session_update_session( instructions: String, session_mode: RealtimeSessionMode, + output_modality: RealtimeOutputModality, voice: RealtimeVoice, ) -> SessionUpdateSession { match session_mode { @@ -67,7 +70,7 @@ pub(super) fn session_update_session( r#type: SessionType::Realtime, model: None, instructions: Some(instructions), - output_modalities: Some(vec![REALTIME_V2_OUTPUT_MODALITY_AUDIO.to_string()]), + output_modalities: Some(vec![output_modality_value(output_modality).to_string()]), audio: SessionAudio { input: SessionAudioInput { format: SessionAudioFormat { @@ -132,6 +135,13 @@ pub(super) fn session_update_session( } } +fn output_modality_value(output_modality: RealtimeOutputModality) -> &'static str { + match output_modality { + RealtimeOutputModality::Text => REALTIME_V2_OUTPUT_MODALITY_TEXT, + RealtimeOutputModality::Audio => REALTIME_V2_OUTPUT_MODALITY_AUDIO, + } +} + pub(super) fn websocket_intent() -> Option<&'static str> { None } diff --git a/codex-rs/codex-api/src/endpoint/realtime_websocket/mod.rs b/codex-rs/codex-api/src/endpoint/realtime_websocket/mod.rs index 4031e012860..1fb49b2436f 100644 --- a/codex-rs/codex-api/src/endpoint/realtime_websocket/mod.rs +++ b/codex-rs/codex-api/src/endpoint/realtime_websocket/mod.rs @@ -13,5 +13,6 @@ pub use methods::RealtimeWebsocketEvents; pub use methods::RealtimeWebsocketWriter; pub use methods_common::session_update_session_json; pub use protocol::RealtimeEventParser; +pub use protocol::RealtimeOutputModality; pub use protocol::RealtimeSessionConfig; pub use protocol::RealtimeSessionMode; diff --git a/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol.rs b/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol.rs index 
0185984c613..0706ea24220 100644 --- a/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol.rs +++ b/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol.rs @@ -2,7 +2,7 @@ use crate::endpoint::realtime_websocket::protocol_v1::parse_realtime_event_v1; use crate::endpoint::realtime_websocket::protocol_v2::parse_realtime_event_v2; pub use codex_protocol::protocol::RealtimeAudioFrame; pub use codex_protocol::protocol::RealtimeEvent; -pub use codex_protocol::protocol::RealtimeTranscriptDelta; +pub use codex_protocol::protocol::RealtimeOutputModality; pub use codex_protocol::protocol::RealtimeTranscriptEntry; pub use codex_protocol::protocol::RealtimeVoice; use serde::Serialize; @@ -27,6 +27,7 @@ pub struct RealtimeSessionConfig { pub session_id: Option, pub event_parser: RealtimeEventParser, pub session_mode: RealtimeSessionMode, + pub output_modality: RealtimeOutputModality, pub voice: RealtimeVoice, } diff --git a/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol_common.rs b/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol_common.rs index dbd8544d94f..c89c5ea4d05 100644 --- a/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol_common.rs +++ b/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol_common.rs @@ -1,5 +1,6 @@ use codex_protocol::protocol::RealtimeEvent; use codex_protocol::protocol::RealtimeTranscriptDelta; +use codex_protocol::protocol::RealtimeTranscriptDone; use serde_json::Value; use tracing::debug; @@ -53,6 +54,17 @@ pub(super) fn parse_transcript_delta_event( .map(|delta| RealtimeTranscriptDelta { delta }) } +pub(super) fn parse_transcript_done_event( + parsed: &Value, + field: &str, +) -> Option { + parsed + .get(field) + .and_then(Value::as_str) + .map(str::to_string) + .map(|text| RealtimeTranscriptDone { text }) +} + pub(super) fn parse_error_event(parsed: &Value) -> Option { parsed .get("message") diff --git a/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol_v2.rs 
b/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol_v2.rs index 4c2c909e802..559e83426bb 100644 --- a/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol_v2.rs +++ b/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol_v2.rs @@ -2,6 +2,7 @@ use crate::endpoint::realtime_websocket::protocol_common::parse_error_event; use crate::endpoint::realtime_websocket::protocol_common::parse_realtime_payload; use crate::endpoint::realtime_websocket::protocol_common::parse_session_updated_event; use crate::endpoint::realtime_websocket::protocol_common::parse_transcript_delta_event; +use crate::endpoint::realtime_websocket::protocol_common::parse_transcript_done_event; use codex_protocol::protocol::RealtimeAudioFrame; use codex_protocol::protocol::RealtimeEvent; use codex_protocol::protocol::RealtimeHandoffRequested; @@ -9,6 +10,7 @@ use codex_protocol::protocol::RealtimeInputAudioSpeechStarted; use codex_protocol::protocol::RealtimeResponseCancelled; use codex_protocol::protocol::RealtimeResponseCreated; use codex_protocol::protocol::RealtimeResponseDone; +use codex_protocol::protocol::RealtimeTranscriptDone; use serde_json::Map as JsonMap; use serde_json::Value; use tracing::debug; @@ -30,8 +32,8 @@ pub(super) fn parse_realtime_event_v2(payload: &str) -> Option { parse_transcript_delta_event(&parsed, "delta").map(RealtimeEvent::InputTranscriptDelta) } "conversation.item.input_audio_transcription.completed" => { - parse_transcript_delta_event(&parsed, "transcript") - .map(RealtimeEvent::InputTranscriptDelta) + parse_transcript_done_event(&parsed, "transcript") + .map(RealtimeEvent::InputTranscriptDone) } "response.output_text.delta" | "response.output_audio_transcript.delta" => { parse_transcript_delta_event(&parsed, "delta").map(RealtimeEvent::OutputTranscriptDelta) @@ -120,12 +122,43 @@ fn parse_conversation_item_done_event(parsed: &Value) -> Option { return Some(handoff); } + if let Some(transcript_done) = parse_item_done_transcript(item) { + return 
Some(transcript_done); + } + item.get("id") .and_then(Value::as_str) .map(str::to_string) .map(|item_id| RealtimeEvent::ConversationItemDone { item_id }) } +fn parse_item_done_transcript(item: &JsonMap) -> Option { + let role = item.get("role").and_then(Value::as_str)?; + let text = item + .get("content") + .and_then(Value::as_array)? + .iter() + .filter_map(item_content_text) + .collect::(); + if text.is_empty() { + return None; + } + + let done = RealtimeTranscriptDone { text }; + match role { + "user" => Some(RealtimeEvent::InputTranscriptDone(done)), + "assistant" => Some(RealtimeEvent::OutputTranscriptDone(done)), + _ => None, + } +} + +fn item_content_text(content: &Value) -> Option<&str> { + content + .get("text") + .or_else(|| content.get("transcript")) + .and_then(Value::as_str) +} + fn parse_handoff_requested_event(item: &JsonMap) -> Option { let item_type = item.get("type").and_then(Value::as_str); let item_name = item.get("name").and_then(Value::as_str); diff --git a/codex-rs/codex-api/src/lib.rs b/codex-rs/codex-api/src/lib.rs index ac26d3cdba7..f4f90b289cc 100644 --- a/codex-rs/codex-api/src/lib.rs +++ b/codex-rs/codex-api/src/lib.rs @@ -41,6 +41,7 @@ pub use crate::endpoint::ModelsClient; pub use crate::endpoint::RealtimeCallClient; pub use crate::endpoint::RealtimeCallResponse; pub use crate::endpoint::RealtimeEventParser; +pub use crate::endpoint::RealtimeOutputModality; pub use crate::endpoint::RealtimeSessionConfig; pub use crate::endpoint::RealtimeSessionMode; pub use crate::endpoint::RealtimeWebsocketClient; diff --git a/codex-rs/codex-api/tests/realtime_websocket_e2e.rs b/codex-rs/codex-api/tests/realtime_websocket_e2e.rs index 9969a96f097..abafaef2aed 100644 --- a/codex-rs/codex-api/tests/realtime_websocket_e2e.rs +++ b/codex-rs/codex-api/tests/realtime_websocket_e2e.rs @@ -6,6 +6,7 @@ use codex_api::Provider; use codex_api::RealtimeAudioFrame; use codex_api::RealtimeEvent; use codex_api::RealtimeEventParser; +use 
codex_api::RealtimeOutputModality; use codex_api::RealtimeSessionConfig; use codex_api::RealtimeSessionMode; use codex_api::RealtimeWebsocketClient; @@ -145,6 +146,7 @@ async fn realtime_ws_e2e_session_create_and_event_flow() { session_id: Some("conv_123".to_string()), event_parser: RealtimeEventParser::V1, session_mode: RealtimeSessionMode::Conversational, + output_modality: RealtimeOutputModality::Audio, voice: RealtimeVoice::Cove, }, HeaderMap::new(), @@ -248,6 +250,7 @@ async fn realtime_ws_connect_webrtc_sideband_retries_join_until_server_is_availa session_id: Some("conv_123".to_string()), event_parser: RealtimeEventParser::RealtimeV2, session_mode: RealtimeSessionMode::Conversational, + output_modality: RealtimeOutputModality::Audio, voice: RealtimeVoice::Marin, }, "rtc_test", @@ -319,6 +322,7 @@ async fn realtime_ws_e2e_send_while_next_event_waits() { session_id: Some("conv_123".to_string()), event_parser: RealtimeEventParser::V1, session_mode: RealtimeSessionMode::Conversational, + output_modality: RealtimeOutputModality::Audio, voice: RealtimeVoice::Cove, }, HeaderMap::new(), @@ -386,6 +390,7 @@ async fn realtime_ws_e2e_disconnected_emitted_once() { session_id: Some("conv_123".to_string()), event_parser: RealtimeEventParser::V1, session_mode: RealtimeSessionMode::Conversational, + output_modality: RealtimeOutputModality::Audio, voice: RealtimeVoice::Cove, }, HeaderMap::new(), @@ -449,6 +454,7 @@ async fn realtime_ws_e2e_ignores_unknown_text_events() { session_id: Some("conv_123".to_string()), event_parser: RealtimeEventParser::V1, session_mode: RealtimeSessionMode::Conversational, + output_modality: RealtimeOutputModality::Audio, voice: RealtimeVoice::Cove, }, HeaderMap::new(), @@ -515,6 +521,7 @@ async fn realtime_ws_e2e_realtime_v2_parser_emits_handoff_requested() { session_id: Some("conv_123".to_string()), event_parser: RealtimeEventParser::RealtimeV2, session_mode: RealtimeSessionMode::Conversational, + output_modality: RealtimeOutputModality::Audio, 
voice: RealtimeVoice::Marin, }, HeaderMap::new(), diff --git a/codex-rs/core/src/realtime_conversation.rs b/codex-rs/core/src/realtime_conversation.rs index 8200ba4908a..e749736710f 100644 --- a/codex-rs/core/src/realtime_conversation.rs +++ b/codex-rs/core/src/realtime_conversation.rs @@ -42,6 +42,7 @@ use codex_protocol::protocol::RealtimeConversationRealtimeEvent; use codex_protocol::protocol::RealtimeConversationSdpEvent; use codex_protocol::protocol::RealtimeConversationStartedEvent; use codex_protocol::protocol::RealtimeHandoffRequested; +use codex_protocol::protocol::RealtimeOutputModality; use codex_protocol::protocol::RealtimeVoice; use codex_protocol::protocol::RealtimeVoicesList; use http::HeaderMap; @@ -593,8 +594,14 @@ async fn prepare_realtime_start( api_provider.base_url = realtime_ws_base_url.clone(); } let version = config.realtime.version; - let session_config = - build_realtime_session_config(sess, params.prompt, params.session_id, params.voice).await?; + let session_config = build_realtime_session_config( + sess, + params.prompt, + params.session_id, + params.output_modality, + params.voice, + ) + .await?; let requested_session_id = session_config.session_id.clone(); let extra_headers = match transport { ConversationStartTransport::Websocket => { @@ -622,6 +629,7 @@ pub(crate) async fn build_realtime_session_config( sess: &Arc, prompt: Option>, session_id: Option, + output_modality: RealtimeOutputModality, voice: Option, ) -> CodexResult { let config = sess.get_config().await; @@ -653,6 +661,13 @@ pub(crate) async fn build_realtime_session_config( RealtimeWsVersion::V1 => RealtimeEventParser::V1, RealtimeWsVersion::V2 => RealtimeEventParser::RealtimeV2, }; + if config.realtime.version == RealtimeWsVersion::V1 + && matches!(output_modality, RealtimeOutputModality::Text) + { + return Err(CodexErr::InvalidRequest( + "text realtime output modality requires realtime v2".to_string(), + )); + } let session_mode = match config.realtime.session_type { 
RealtimeWsMode::Conversational => RealtimeSessionMode::Conversational, RealtimeWsMode::Transcription => RealtimeSessionMode::Transcription, @@ -667,6 +682,7 @@ pub(crate) async fn build_realtime_session_config( session_id: Some(session_id.unwrap_or_else(|| sess.conversation_id.to_string())), event_parser, session_mode, + output_modality, voice, }) } @@ -1216,7 +1232,9 @@ async fn handle_realtime_server_event( RealtimeEvent::Error(_) => true, RealtimeEvent::SessionUpdated { .. } | RealtimeEvent::InputTranscriptDelta(_) + | RealtimeEvent::InputTranscriptDone(_) | RealtimeEvent::OutputTranscriptDelta(_) + | RealtimeEvent::OutputTranscriptDone(_) | RealtimeEvent::ConversationItemAdded(_) | RealtimeEvent::ConversationItemDone { .. } => false, }; diff --git a/codex-rs/core/tests/suite/compact_remote.rs b/codex-rs/core/tests/suite/compact_remote.rs index a2681423f30..8322046d10f 100644 --- a/codex-rs/core/tests/suite/compact_remote.rs +++ b/codex-rs/core/tests/suite/compact_remote.rs @@ -17,6 +17,7 @@ use codex_protocol::protocol::ItemStartedEvent; use codex_protocol::protocol::Op; use codex_protocol::protocol::RealtimeConversationRealtimeEvent; use codex_protocol::protocol::RealtimeEvent; +use codex_protocol::protocol::RealtimeOutputModality; use codex_protocol::protocol::RolloutItem; use codex_protocol::protocol::RolloutLine; use codex_protocol::user_input::UserInput; @@ -116,6 +117,7 @@ async fn start_remote_realtime_server() -> responses::WebSocketTestServer { async fn start_realtime_conversation(codex: &codex_core::CodexThread) -> Result<()> { codex .submit(Op::RealtimeConversationStart(ConversationStartParams { + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, diff --git a/codex-rs/core/tests/suite/realtime_conversation.rs b/codex-rs/core/tests/suite/realtime_conversation.rs index c4e2c758c68..174491f9d60 100644 --- a/codex-rs/core/tests/suite/realtime_conversation.rs +++ 
b/codex-rs/core/tests/suite/realtime_conversation.rs @@ -21,6 +21,7 @@ use codex_protocol::protocol::RealtimeAudioFrame; use codex_protocol::protocol::RealtimeConversationRealtimeEvent; use codex_protocol::protocol::RealtimeConversationVersion; use codex_protocol::protocol::RealtimeEvent; +use codex_protocol::protocol::RealtimeOutputModality; use codex_protocol::protocol::RealtimeVoice; use codex_protocol::protocol::RolloutItem; use codex_protocol::protocol::SessionSource; @@ -248,6 +249,7 @@ async fn conversation_start_audio_text_close_round_trip() -> Result<()> { test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, @@ -381,6 +383,7 @@ async fn conversation_start_defaults_to_v2_and_gpt_realtime_1_5() -> Result<()> test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, @@ -464,6 +467,7 @@ async fn conversation_webrtc_start_posts_generated_session() -> Result<()> { test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: Some(ConversationStartTransport::Webrtc { @@ -601,6 +605,7 @@ async fn conversation_start_uses_openai_env_key_fallback_with_chatgpt_auth() -> test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, @@ -662,6 +667,7 @@ async fn conversation_transport_close_emits_closed_event() -> Result<()> { test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("backend 
prompt".to_string())), session_id: None, transport: None, @@ -747,6 +753,7 @@ async fn conversation_start_preflight_failure_emits_realtime_error_only() -> Res test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, @@ -790,6 +797,7 @@ async fn conversation_start_connect_failure_emits_realtime_error_only() -> Resul test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, @@ -880,6 +888,7 @@ async fn conversation_second_start_replaces_runtime() -> Result<()> { test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("old".to_string())), session_id: Some("conv_old".to_string()), transport: None, @@ -898,6 +907,7 @@ async fn conversation_second_start_replaces_runtime() -> Result<()> { test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("new".to_string())), session_id: Some("conv_new".to_string()), transport: None, @@ -987,6 +997,7 @@ async fn conversation_uses_experimental_realtime_ws_base_url_override() -> Resul test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, @@ -1044,6 +1055,7 @@ async fn conversation_uses_default_realtime_backend_prompt() -> Result<()> { test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { + output_modality: RealtimeOutputModality::Audio, prompt: None, session_id: None, transport: None, @@ -1109,6 +1121,7 @@ async fn conversation_uses_empty_instructions_for_null_or_empty_prompt() -> Resu ] { 
test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { + output_modality: RealtimeOutputModality::Audio, prompt, session_id: None, transport: None, @@ -1167,6 +1180,7 @@ async fn conversation_uses_explicit_start_voice() -> Result<()> { test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, @@ -1217,6 +1231,7 @@ async fn conversation_uses_configured_realtime_voice() -> Result<()> { test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, @@ -1255,6 +1270,7 @@ async fn conversation_rejects_voice_for_wrong_realtime_version() -> Result<()> { test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, @@ -1298,6 +1314,7 @@ async fn conversation_uses_experimental_realtime_ws_backend_prompt_override() -> test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("prompt from op".to_string())), session_id: None, transport: None, @@ -1363,6 +1380,7 @@ async fn conversation_uses_experimental_realtime_ws_startup_context_override() - test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("prompt from op".to_string())), session_id: None, transport: None, @@ -1426,6 +1444,7 @@ async fn conversation_disables_realtime_startup_context_with_empty_override() -> test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("prompt from op".to_string())), session_id: 
None, transport: None, @@ -1482,6 +1501,7 @@ async fn conversation_start_injects_startup_context_from_thread_history() -> Res test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, @@ -1593,6 +1613,7 @@ async fn conversation_startup_context_current_thread_selects_many_turns_by_budge codex .submit(Op::RealtimeConversationStart(ConversationStartParams { + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, @@ -1697,6 +1718,7 @@ async fn conversation_startup_context_falls_back_to_workspace_map() -> Result<() test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, @@ -1751,6 +1773,7 @@ async fn conversation_startup_context_is_truncated_and_sent_once_per_start() -> test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, @@ -1826,6 +1849,7 @@ async fn conversation_user_text_turn_is_sent_to_realtime_when_active() -> Result test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, @@ -1948,6 +1972,7 @@ async fn conversation_user_text_turn_is_capped_when_mirrored_to_realtime() -> Re // active WebSocket session. 
test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, @@ -2075,6 +2100,7 @@ async fn conversation_mirrors_assistant_message_text_to_realtime_handoff() -> Re test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, @@ -2204,6 +2230,7 @@ async fn conversation_handoff_persists_across_item_done_until_turn_complete() -> test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, @@ -2348,6 +2375,7 @@ async fn inbound_handoff_request_starts_turn() -> Result<()> { test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, @@ -2445,6 +2473,7 @@ async fn inbound_handoff_request_uses_active_transcript() -> Result<()> { test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, @@ -2540,6 +2569,7 @@ async fn inbound_handoff_request_clears_active_transcript_after_each_handoff() - test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, @@ -2642,6 +2672,7 @@ async fn inbound_conversation_item_does_not_start_turn_and_still_forwards_audio( test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { + output_modality: RealtimeOutputModality::Audio, prompt: 
Some(Some("backend prompt".to_string())), session_id: None, transport: None, @@ -2757,6 +2788,7 @@ async fn delegated_turn_user_role_echo_does_not_redelegate_and_still_forwards_au test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, @@ -2902,6 +2934,7 @@ async fn inbound_handoff_request_does_not_block_realtime_event_forwarding() -> R test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, @@ -3032,6 +3065,7 @@ async fn inbound_handoff_request_steers_active_turn() -> Result<()> { test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, @@ -3183,6 +3217,7 @@ async fn inbound_handoff_request_starts_turn_and_does_not_block_realtime_audio() test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, diff --git a/codex-rs/protocol/src/protocol.rs b/codex-rs/protocol/src/protocol.rs index 1fc707469e0..7431e5f1230 100644 --- a/codex-rs/protocol/src/protocol.rs +++ b/codex-rs/protocol/src/protocol.rs @@ -134,6 +134,8 @@ pub struct McpServerRefreshConfig { #[derive(Debug, Clone, Deserialize, Serialize, PartialEq, JsonSchema, TS)] pub struct ConversationStartParams { + /// Selects whether the realtime session should produce text or audio output. 
+ pub output_modality: RealtimeOutputModality, #[serde( default, deserialize_with = "conversation_start_prompt_serde::deserialize", @@ -157,6 +159,13 @@ pub enum ConversationStartTransport { Webrtc { sdp: String }, } +#[derive(Debug, Clone, Copy, Deserialize, Serialize, PartialEq, Eq, JsonSchema, TS)] +#[serde(rename_all = "snake_case")] +pub enum RealtimeOutputModality { + Text, + Audio, +} + mod conversation_start_prompt_serde { use serde::Deserializer; use serde::Serializer; @@ -290,6 +299,11 @@ pub struct RealtimeTranscriptDelta { pub delta: String, } +#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq, JsonSchema, TS)] +pub struct RealtimeTranscriptDone { + pub text: String, +} + #[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq, JsonSchema, TS)] pub struct RealtimeTranscriptEntry { pub role: String, @@ -332,7 +346,9 @@ pub enum RealtimeEvent { }, InputAudioSpeechStarted(RealtimeInputAudioSpeechStarted), InputTranscriptDelta(RealtimeTranscriptDelta), + InputTranscriptDone(RealtimeTranscriptDone), OutputTranscriptDelta(RealtimeTranscriptDelta), + OutputTranscriptDone(RealtimeTranscriptDone), AudioOut(RealtimeAudioFrame), ResponseCreated(RealtimeResponseCreated), ResponseCancelled(RealtimeResponseCancelled), @@ -4586,12 +4602,14 @@ mod tests { }, }); let start = Op::RealtimeConversationStart(ConversationStartParams { + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("be helpful".to_string())), session_id: Some("conv_1".to_string()), transport: None, voice: None, }); let webrtc_start = Op::RealtimeConversationStart(ConversationStartParams { + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("be helpful".to_string())), session_id: Some("conv_1".to_string()), transport: Some(ConversationStartTransport::Webrtc { @@ -4604,12 +4622,14 @@ mod tests { }); let close = Op::RealtimeConversationClose; let default_prompt_start = Op::RealtimeConversationStart(ConversationStartParams { + output_modality: 
RealtimeOutputModality::Audio, prompt: None, session_id: None, transport: None, voice: None, }); let null_prompt_start = Op::RealtimeConversationStart(ConversationStartParams { + output_modality: RealtimeOutputModality::Audio, prompt: Some(None), session_id: None, transport: None, @@ -4621,6 +4641,7 @@ mod tests { serde_json::to_value(&start).unwrap(), json!({ "type": "realtime_conversation_start", + "output_modality": "audio", "prompt": "be helpful", "session_id": "conv_1" }) @@ -4628,19 +4649,22 @@ mod tests { assert_eq!( serde_json::to_value(&default_prompt_start).unwrap(), json!({ - "type": "realtime_conversation_start" + "type": "realtime_conversation_start", + "output_modality": "audio" }) ); assert_eq!( serde_json::to_value(&null_prompt_start).unwrap(), json!({ "type": "realtime_conversation_start", + "output_modality": "audio", "prompt": null }) ); assert_eq!( serde_json::from_value::(json!({ - "type": "realtime_conversation_start" + "type": "realtime_conversation_start", + "output_modality": "audio" })) .unwrap(), default_prompt_start @@ -4648,6 +4672,7 @@ mod tests { assert_eq!( serde_json::from_value::(json!({ "type": "realtime_conversation_start", + "output_modality": "audio", "prompt": null })) .unwrap(), @@ -4693,6 +4718,7 @@ mod tests { serde_json::to_value(&webrtc_start).unwrap(), json!({ "type": "realtime_conversation_start", + "output_modality": "audio", "prompt": "be helpful", "session_id": "conv_1", "transport": { diff --git a/codex-rs/tui/src/app/app_server_adapter.rs b/codex-rs/tui/src/app/app_server_adapter.rs index 0bd78f353a9..b5eccbfc6d6 100644 --- a/codex-rs/tui/src/app/app_server_adapter.rs +++ b/codex-rs/tui/src/app/app_server_adapter.rs @@ -385,7 +385,10 @@ fn server_notification_thread_target( ServerNotification::ThreadRealtimeItemAdded(notification) => { Some(notification.thread_id.as_str()) } - ServerNotification::ThreadRealtimeTranscriptUpdated(notification) => { + ServerNotification::ThreadRealtimeTranscriptDelta(notification) => 
{ + Some(notification.thread_id.as_str()) + } + ServerNotification::ThreadRealtimeTranscriptDone(notification) => { Some(notification.thread_id.as_str()) } ServerNotification::ThreadRealtimeOutputAudioDelta(notification) => { diff --git a/codex-rs/tui/src/app_server_session.rs b/codex-rs/tui/src/app_server_session.rs index 5f75bd390a6..1afa040d70f 100644 --- a/codex-rs/tui/src/app_server_session.rs +++ b/codex-rs/tui/src/app_server_session.rs @@ -657,6 +657,7 @@ impl AppServerSession { request_id, params: ThreadRealtimeStartParams { thread_id: thread_id.to_string(), + output_modality: params.output_modality, prompt: params.prompt, session_id: params.session_id, voice: params.voice, diff --git a/codex-rs/tui/src/chatwidget.rs b/codex-rs/tui/src/chatwidget.rs index efb4c041940..4d5275a25dc 100644 --- a/codex-rs/tui/src/chatwidget.rs +++ b/codex-rs/tui/src/chatwidget.rs @@ -6245,7 +6245,8 @@ impl ChatWidget { | ServerNotification::FsChanged(_) | ServerNotification::FuzzyFileSearchSessionUpdated(_) | ServerNotification::FuzzyFileSearchSessionCompleted(_) - | ServerNotification::ThreadRealtimeTranscriptUpdated(_) + | ServerNotification::ThreadRealtimeTranscriptDelta(_) + | ServerNotification::ThreadRealtimeTranscriptDone(_) | ServerNotification::WindowsWorldWritableWarning(_) | ServerNotification::WindowsSandboxSetupCompleted(_) | ServerNotification::AccountLoginCompleted(_) => {} diff --git a/codex-rs/tui/src/chatwidget/realtime.rs b/codex-rs/tui/src/chatwidget/realtime.rs index 03a59c224ca..6357361f8ee 100644 --- a/codex-rs/tui/src/chatwidget/realtime.rs +++ b/codex-rs/tui/src/chatwidget/realtime.rs @@ -7,6 +7,7 @@ use codex_protocol::protocol::RealtimeConversationClosedEvent; use codex_protocol::protocol::RealtimeConversationRealtimeEvent; use codex_protocol::protocol::RealtimeConversationStartedEvent; use codex_protocol::protocol::RealtimeEvent; +use codex_protocol::protocol::RealtimeOutputModality; use codex_realtime_webrtc::RealtimeWebrtcEvent; use 
codex_realtime_webrtc::RealtimeWebrtcSession; use codex_realtime_webrtc::RealtimeWebrtcSessionHandle; @@ -236,6 +237,7 @@ impl ChatWidget { ) { self.submit_op(AppCommand::realtime_conversation_start( ConversationStartParams { + output_modality: RealtimeOutputModality::Audio, prompt: None, session_id: None, transport, @@ -327,7 +329,9 @@ impl ChatWidget { } RealtimeEvent::InputAudioSpeechStarted(_) => self.interrupt_realtime_audio_playback(), RealtimeEvent::InputTranscriptDelta(_) => {} + RealtimeEvent::InputTranscriptDone(_) => {} RealtimeEvent::OutputTranscriptDelta(_) => {} + RealtimeEvent::OutputTranscriptDone(_) => {} RealtimeEvent::AudioOut(frame) => self.enqueue_realtime_audio_out(&frame), RealtimeEvent::ResponseCreated(_) => {} RealtimeEvent::ResponseCancelled(_) => self.interrupt_realtime_audio_playback(),