From 39af7aca716e267271474a3962faecd0513ad728 Mon Sep 17 00:00:00 2001 From: Ahmed Ibrahim Date: Mon, 13 Apr 2026 15:08:29 -0700 Subject: [PATCH 01/17] Add realtime connection mode to app-server protocol --- .../schema/json/ClientRequest.json | 7 + .../schema/json/ServerNotification.json | 16 +- .../codex_app_server_protocol.schemas.json | 23 ++- .../codex_app_server_protocol.v2.schemas.json | 23 ++- ...RealtimeTranscriptUpdatedNotification.json | 18 +- .../schema/typescript/RealtimeConnection.ts | 5 + .../RealtimeTranscriptUpdateKind.ts | 5 + .../schema/typescript/index.ts | 2 + ...adRealtimeTranscriptUpdatedNotification.ts | 11 +- .../src/protocol/common.rs | 13 +- .../app-server-protocol/src/protocol/v2.rs | 13 +- codex-rs/app-server/README.md | 5 +- .../app-server/src/bespoke_event_handling.rs | 2 + .../app-server/src/codex_message_processor.rs | 1 + .../tests/suite/v2/experimental_api.rs | 3 + .../tests/suite/v2/realtime_conversation.rs | 155 ++++++++++++++++++ codex-rs/codex-api/src/endpoint/mod.rs | 2 + .../codex-api/src/endpoint/realtime_call.rs | 2 + .../endpoint/realtime_websocket/methods.rs | 91 +++++++++- .../realtime_websocket/methods_common.rs | 5 +- .../endpoint/realtime_websocket/methods_v2.rs | 12 +- .../src/endpoint/realtime_websocket/mod.rs | 2 + .../endpoint/realtime_websocket/protocol.rs | 3 + .../realtime_websocket/protocol_common.rs | 11 +- .../realtime_websocket/protocol_v2.rs | 12 +- codex-rs/codex-api/src/lib.rs | 2 + .../codex-api/tests/realtime_websocket_e2e.rs | 7 + codex-rs/core/src/realtime_conversation.rs | 20 ++- codex-rs/core/tests/suite/compact_remote.rs | 2 + .../core/tests/suite/realtime_conversation.rs | 34 ++++ codex-rs/protocol/src/protocol.rs | 31 +++- codex-rs/tui/src/app_server_session.rs | 1 + codex-rs/tui/src/chatwidget/realtime.rs | 2 + 33 files changed, 510 insertions(+), 31 deletions(-) create mode 100644 codex-rs/app-server-protocol/schema/typescript/RealtimeConnection.ts create mode 100644 codex-rs/app-server-protocol/schema/typescript/RealtimeTranscriptUpdateKind.ts diff --git a/codex-rs/app-server-protocol/schema/json/ClientRequest.json b/codex-rs/app-server-protocol/schema/json/ClientRequest.json index a6044bba181..a0c9bc90b47 100644 --- a/codex-rs/app-server-protocol/schema/json/ClientRequest.json +++ b/codex-rs/app-server-protocol/schema/json/ClientRequest.json @@ -1499,6 +1499,13 @@ } ] }, + "RealtimeConnection": { + "enum": [ + "text", + "audio" + ], + "type": "string" + }, "RealtimeVoice": { "enum": [ "alloy", diff --git a/codex-rs/app-server-protocol/schema/json/ServerNotification.json b/codex-rs/app-server-protocol/schema/json/ServerNotification.json index c3ab83766a5..df92b988ea7 100644 --- a/codex-rs/app-server-protocol/schema/json/ServerNotification.json +++ b/codex-rs/app-server-protocol/schema/json/ServerNotification.json @@ -2111,6 +2111,13 @@ ], "type": "string" }, + "RealtimeTranscriptUpdateKind": { + "enum": [ + "delta", + "done" + ], + "type": "string" + }, "ReasoningEffort": { "description": "See https://platform.openai.com/docs/guides/reasoning?api-mode=responses#get-started-with-reasoning", "enum": [ @@ -3385,22 +3392,27 @@ "type": "object" }, "ThreadRealtimeTranscriptUpdatedNotification": { - "description": "EXPERIMENTAL - flat transcript delta emitted whenever realtime transcript text changes.", + "description": "EXPERIMENTAL - flat transcript update emitted whenever realtime transcript text changes or completes.", "properties": { "role": { "type": "string" }, "text": { + "description": "Delta text for delta updates; final complete text for done updates.", "type": "string" }, "threadId": { "type": "string" + }, + "updateKind": { + "$ref": "#/definitions/RealtimeTranscriptUpdateKind" } }, "required": [ "role", "text", - "threadId" + "threadId", + "updateKind" ], "type": "object" }, diff --git a/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.schemas.json b/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.schemas.json index 1f207207015..30d4e3b92bb 100644 --- a/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.schemas.json +++ b/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.schemas.json @@ -10608,6 +10608,13 @@ } ] }, + "RealtimeConnection": { + "enum": [ + "text", + "audio" + ], + "type": "string" + }, "RealtimeConversationVersion": { "enum": [ "v1", @@ -10615,6 +10622,13 @@ ], "type": "string" }, + "RealtimeTranscriptUpdateKind": { + "enum": [ + "delta", + "done" + ], + "type": "string" + }, "RealtimeVoice": { "enum": [ "alloy", @@ -13963,22 +13977,27 @@ }, "ThreadRealtimeTranscriptUpdatedNotification": { "$schema": "http://json-schema.org/draft-07/schema#", - "description": "EXPERIMENTAL - flat transcript delta emitted whenever realtime transcript text changes.", + "description": "EXPERIMENTAL - flat transcript update emitted whenever realtime transcript text changes or completes.", "properties": { "role": { "type": "string" }, "text": { + "description": "Delta text for delta updates; final complete text for done updates.", "type": "string" }, "threadId": { "type": "string" + }, + "updateKind": { + "$ref": "#/definitions/v2/RealtimeTranscriptUpdateKind" } }, "required": [ "role", "text", - "threadId" + "threadId", + "updateKind" ], "title": "ThreadRealtimeTranscriptUpdatedNotification", "type": "object" diff --git a/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.v2.schemas.json b/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.v2.schemas.json index d4d76de9f0e..d1ba7646c9a 100644 --- a/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.v2.schemas.json +++ b/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.v2.schemas.json @@ -7404,6 +7404,13 @@ } ] }, + "RealtimeConnection": { + "enum": [ + "text", + "audio" + ], + "type": "string" + }, "RealtimeConversationVersion": { "enum": [ "v1", @@ -7411,6 +7418,13 @@ ], "type": "string" }, + "RealtimeTranscriptUpdateKind": { + "enum": [ + "delta", + "done" + ], + "type": "string" + }, "RealtimeVoice": { "enum": [ "alloy", @@ -11811,22 +11825,27 @@ }, "ThreadRealtimeTranscriptUpdatedNotification": { "$schema": "http://json-schema.org/draft-07/schema#", - "description": "EXPERIMENTAL - flat transcript delta emitted whenever realtime transcript text changes.", + "description": "EXPERIMENTAL - flat transcript update emitted whenever realtime transcript text changes or completes.", "properties": { "role": { "type": "string" }, "text": { + "description": "Delta text for delta updates; final complete text for done updates.", "type": "string" }, "threadId": { "type": "string" + }, + "updateKind": { + "$ref": "#/definitions/RealtimeTranscriptUpdateKind" } }, "required": [ "role", "text", - "threadId" + "threadId", + "updateKind" ], "title": "ThreadRealtimeTranscriptUpdatedNotification", "type": "object" diff --git a/codex-rs/app-server-protocol/schema/json/v2/ThreadRealtimeTranscriptUpdatedNotification.json b/codex-rs/app-server-protocol/schema/json/v2/ThreadRealtimeTranscriptUpdatedNotification.json index 2c6860fa312..f936e581720 100644 --- a/codex-rs/app-server-protocol/schema/json/v2/ThreadRealtimeTranscriptUpdatedNotification.json +++ b/codex-rs/app-server-protocol/schema/json/v2/ThreadRealtimeTranscriptUpdatedNotification.json @@ -1,21 +1,35 @@ { "$schema": "http://json-schema.org/draft-07/schema#", - "description": "EXPERIMENTAL - flat transcript delta emitted whenever realtime transcript text changes.", + "definitions": { + "RealtimeTranscriptUpdateKind": { + "enum": [ + "delta", + "done" + ], + "type": "string" + } + }, + "description": "EXPERIMENTAL - flat transcript update emitted whenever realtime transcript text changes or completes.", "properties": { "role": { "type": "string" }, "text": { + "description": "Delta text for delta updates; final complete text for done updates.", "type": "string" }, "threadId": { "type": "string" + }, + "updateKind": { + "$ref": "#/definitions/RealtimeTranscriptUpdateKind" } }, "required": [ "role", "text", - "threadId" + "threadId", + "updateKind" ], "title": "ThreadRealtimeTranscriptUpdatedNotification", "type": "object" diff --git a/codex-rs/app-server-protocol/schema/typescript/RealtimeConnection.ts b/codex-rs/app-server-protocol/schema/typescript/RealtimeConnection.ts new file mode 100644 index 00000000000..bcd9e1b00ec --- /dev/null +++ b/codex-rs/app-server-protocol/schema/typescript/RealtimeConnection.ts @@ -0,0 +1,5 @@ +// GENERATED CODE! DO NOT MODIFY BY HAND! + +// This file was generated by [ts-rs](https://github.com/Aleph-Alpha/ts-rs). Do not edit this file manually. + +export type RealtimeConnection = "text" | "audio"; diff --git a/codex-rs/app-server-protocol/schema/typescript/RealtimeTranscriptUpdateKind.ts b/codex-rs/app-server-protocol/schema/typescript/RealtimeTranscriptUpdateKind.ts new file mode 100644 index 00000000000..1bc6d6ed8e7 --- /dev/null +++ b/codex-rs/app-server-protocol/schema/typescript/RealtimeTranscriptUpdateKind.ts @@ -0,0 +1,5 @@ +// GENERATED CODE! DO NOT MODIFY BY HAND! + +// This file was generated by [ts-rs](https://github.com/Aleph-Alpha/ts-rs). Do not edit this file manually. + +export type RealtimeTranscriptUpdateKind = "delta" | "done"; diff --git a/codex-rs/app-server-protocol/schema/typescript/index.ts b/codex-rs/app-server-protocol/schema/typescript/index.ts index 3f07f716958..d3d4c57e62d 100644 --- a/codex-rs/app-server-protocol/schema/typescript/index.ts +++ b/codex-rs/app-server-protocol/schema/typescript/index.ts @@ -48,7 +48,9 @@ export type { NetworkPolicyRuleAction } from "./NetworkPolicyRuleAction"; export type { ParsedCommand } from "./ParsedCommand"; export type { Personality } from "./Personality"; export type { PlanType } from "./PlanType"; +export type { RealtimeConnection } from "./RealtimeConnection"; export type { RealtimeConversationVersion } from "./RealtimeConversationVersion"; +export type { RealtimeTranscriptUpdateKind } from "./RealtimeTranscriptUpdateKind"; export type { RealtimeVoice } from "./RealtimeVoice"; export type { RealtimeVoicesList } from "./RealtimeVoicesList"; export type { ReasoningEffort } from "./ReasoningEffort"; diff --git a/codex-rs/app-server-protocol/schema/typescript/v2/ThreadRealtimeTranscriptUpdatedNotification.ts b/codex-rs/app-server-protocol/schema/typescript/v2/ThreadRealtimeTranscriptUpdatedNotification.ts index d2940029f2f..bf1b1e3439d 100644 --- a/codex-rs/app-server-protocol/schema/typescript/v2/ThreadRealtimeTranscriptUpdatedNotification.ts +++ b/codex-rs/app-server-protocol/schema/typescript/v2/ThreadRealtimeTranscriptUpdatedNotification.ts @@ -1,9 +1,14 @@ // GENERATED CODE! DO NOT MODIFY BY HAND! // This file was generated by [ts-rs](https://github.com/Aleph-Alpha/ts-rs). Do not edit this file manually. +import type { RealtimeTranscriptUpdateKind } from "../RealtimeTranscriptUpdateKind"; /** - * EXPERIMENTAL - flat transcript delta emitted whenever realtime - * transcript text changes. + * EXPERIMENTAL - flat transcript update emitted whenever realtime + * transcript text changes or completes. */ -export type ThreadRealtimeTranscriptUpdatedNotification = { threadId: string, role: string, text: string, }; +export type ThreadRealtimeTranscriptUpdatedNotification = { threadId: string, role: string, +/** + * Delta text for delta updates; final complete text for done updates. + */ +text: string, updateKind: RealtimeTranscriptUpdateKind, }; diff --git a/codex-rs/app-server-protocol/src/protocol/common.rs b/codex-rs/app-server-protocol/src/protocol/common.rs index f26f8366f09..04ac2a5742d 100644 --- a/codex-rs/app-server-protocol/src/protocol/common.rs +++ b/codex-rs/app-server-protocol/src/protocol/common.rs @@ -1050,7 +1050,9 @@ mod tests { use codex_protocol::ThreadId; use codex_protocol::account::PlanType; use codex_protocol::parse_command::ParsedCommand; + use codex_protocol::protocol::RealtimeConnection; use codex_protocol::protocol::RealtimeConversationVersion; + use codex_protocol::protocol::RealtimeVoice; use codex_utils_absolute_path::AbsolutePathBuf; use pretty_assertions::assert_eq; use serde_json::json; @@ -1779,10 +1781,11 @@ mod tests { request_id: RequestId::Integer(9), params: v2::ThreadRealtimeStartParams { thread_id: "thr_123".to_string(), + connection: RealtimeConnection::Audio, prompt: Some(Some("You are on a call".to_string())), session_id: Some("sess_456".to_string()), transport: None, - voice: Some(codex_protocol::protocol::RealtimeVoice::Marin), + voice: Some(RealtimeVoice::Marin), }, }; assert_eq!( @@ -1791,6 +1794,7 @@ mod tests { "id": 9, "params": { "threadId": "thr_123", + "connection": "audio", "prompt": "You are on a call", "sessionId": "sess_456", "transport": null, @@ -1808,6 +1812,7 @@ mod tests { request_id: RequestId::Integer(9), params: v2::ThreadRealtimeStartParams { thread_id: "thr_123".to_string(), + connection: RealtimeConnection::Audio, prompt: None, session_id: None, transport: None, @@ -1820,6 +1825,7 @@ mod tests { "id": 9, "params": { "threadId": "thr_123", + "connection": "audio", "sessionId": null, "transport": null, "voice": null @@ -1832,6 +1838,7 @@ mod tests { request_id: RequestId::Integer(9), params: v2::ThreadRealtimeStartParams { thread_id: "thr_123".to_string(), + connection: RealtimeConnection::Audio, prompt: Some(None), session_id: None, transport: None, @@ -1844,6 +1851,7 @@ mod tests { "id": 9, "params": { "threadId": "thr_123", + "connection": "audio", "prompt": null, "sessionId": null, "transport": null, @@ -1858,6 +1866,7 @@ mod tests { "id": 9, "params": { "threadId": "thr_123", + "connection": "audio", "sessionId": null, "transport": null, "voice": null @@ -1873,6 +1882,7 @@ mod tests { "id": 9, "params": { "threadId": "thr_123", + "connection": "audio", "prompt": null, "sessionId": null, "transport": null, @@ -1957,6 +1967,7 @@ mod tests { request_id: RequestId::Integer(1), params: v2::ThreadRealtimeStartParams { thread_id: "thr_123".to_string(), + connection: RealtimeConnection::Audio, prompt: Some(Some("You are on a call".to_string())), session_id: None, transport: None, diff --git a/codex-rs/app-server-protocol/src/protocol/v2.rs b/codex-rs/app-server-protocol/src/protocol/v2.rs index 8a6a6e57b30..7bd8206f3fe 100644 --- a/codex-rs/app-server-protocol/src/protocol/v2.rs +++ b/codex-rs/app-server-protocol/src/protocol/v2.rs @@ -73,7 +73,9 @@ use codex_protocol::protocol::RateLimitSnapshot as CoreRateLimitSnapshot; use codex_protocol::protocol::RateLimitWindow as CoreRateLimitWindow; use codex_protocol::protocol::ReadOnlyAccess as CoreReadOnlyAccess; use codex_protocol::protocol::RealtimeAudioFrame as CoreRealtimeAudioFrame; +use codex_protocol::protocol::RealtimeConnection; use codex_protocol::protocol::RealtimeConversationVersion; +use codex_protocol::protocol::RealtimeTranscriptUpdateKind; use codex_protocol::protocol::RealtimeVoice; use codex_protocol::protocol::RealtimeVoicesList; use codex_protocol::protocol::ReviewDecision as CoreReviewDecision; @@ -3954,11 +3956,14 @@ impl From for CoreRealtimeAudioFrame { } /// EXPERIMENTAL - start a thread-scoped realtime session. -#[derive(Serialize, Deserialize, Debug, Default, Clone, PartialEq, JsonSchema, TS)] +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, JsonSchema, TS)] #[serde(rename_all = "camelCase")] #[ts(export_to = "v2/")] pub struct ThreadRealtimeStartParams { pub thread_id: String, + /// Selects text or audio output for the realtime session. Transport and voice stay + /// independent so clients can choose how they connect separately from what the model emits. + pub connection: RealtimeConnection, #[serde( default, deserialize_with = "super::serde_helpers::deserialize_double_option", @@ -4071,15 +4076,17 @@ pub struct ThreadRealtimeItemAddedNotification { pub item: JsonValue, } -/// EXPERIMENTAL - flat transcript delta emitted whenever realtime -/// transcript text changes. +/// EXPERIMENTAL - flat transcript update emitted whenever realtime +/// transcript text changes or completes. #[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, JsonSchema, TS)] #[serde(rename_all = "camelCase")] #[ts(export_to = "v2/")] pub struct ThreadRealtimeTranscriptUpdatedNotification { pub thread_id: String, pub role: String, + /// Delta text for delta updates; final complete text for done updates. pub text: String, + pub update_kind: RealtimeTranscriptUpdateKind, } /// EXPERIMENTAL - streamed output audio emitted by thread realtime. diff --git a/codex-rs/app-server/README.md b/codex-rs/app-server/README.md index b0a16616aa2..f67ac7596c5 100644 --- a/codex-rs/app-server/README.md +++ b/codex-rs/app-server/README.md @@ -153,7 +153,7 @@ Example with notification opt-out: - `turn/start` — add user input to a thread and begin Codex generation; responds with the initial `turn` object and streams `turn/started`, `item/*`, and `turn/completed` notifications. For `collaborationMode`, `settings.developer_instructions: null` means "use built-in instructions for the selected mode". - `turn/steer` — add user input to an already in-flight regular turn without starting a new turn; returns the active `turnId` that accepted the input. Review and manual compaction turns reject `turn/steer`. - `turn/interrupt` — request cancellation of an in-flight turn by `(thread_id, turn_id)`; success is an empty `{}` response and the turn finishes with `status: "interrupted"`. -- `thread/realtime/start` — start a thread-scoped realtime session (experimental); returns `{}` and streams `thread/realtime/*` notifications. Omit `transport` for the websocket transport, or pass `{ "type": "webrtc", "sdp": "..." }` to create a WebRTC session from a browser-generated SDP offer; the remote answer SDP is emitted as `thread/realtime/sdp`. +- `thread/realtime/start` — start a thread-scoped realtime session (experimental); pass `connection: "text"` or `connection: "audio"` to choose model output, returns `{}` and streams `thread/realtime/*` notifications. Omit `transport` for the websocket transport, or pass `{ "type": "webrtc", "sdp": "..." }` to create a WebRTC session from a browser-generated SDP offer; the remote answer SDP is emitted as `thread/realtime/sdp`. - `thread/realtime/appendAudio` — append an input audio chunk to the active realtime session (experimental); returns `{}`. - `thread/realtime/appendText` — append text input to the active realtime session (experimental); returns `{}`. - `thread/realtime/stop` — stop the active realtime session for the thread (experimental); returns `{}`. @@ -608,6 +608,7 @@ Then send `offer.sdp` to app-server. Core uses `experimental_realtime_ws_backend ```json { "method": "thread/realtime/start", "id": 40, "params": { "threadId": "thr_123", + "connection": "audio", "prompt": "You are on a call.", "sessionId": null, "transport": { "type": "webrtc", "sdp": "v=0\r\no=..." } @@ -931,7 +932,7 @@ The thread realtime API emits thread-scoped notifications for session lifecycle - `thread/realtime/started` — `{ threadId, sessionId }` once realtime starts for the thread (experimental). - `thread/realtime/itemAdded` — `{ threadId, item }` for raw non-audio realtime items that do not have a dedicated typed app-server notification, including `handoff_request` (experimental). `item` is forwarded as raw JSON while the upstream websocket item schema remains unstable. -- `thread/realtime/transcriptUpdated` — `{ threadId, role, text }` whenever realtime transcript text changes (experimental). This forwards the live transcript delta from that realtime event, not the full accumulated transcript. +- `thread/realtime/transcriptUpdated` — `{ threadId, role, text, updateKind }` whenever realtime transcript text changes (experimental). `updateKind: "delta"` forwards the live transcript delta, and `updateKind: "done"` forwards the final full text for that output part. - `thread/realtime/outputAudio/delta` — `{ threadId, audio }` for streamed output audio chunks (experimental). `audio` uses camelCase fields (`data`, `sampleRate`, `numChannels`, `samplesPerChannel`). - `thread/realtime/error` — `{ threadId, message }` when realtime encounters a transport or backend error (experimental). - `thread/realtime/closed` — `{ threadId, reason }` when the realtime transport closes (experimental). diff --git a/codex-rs/app-server/src/bespoke_event_handling.rs b/codex-rs/app-server/src/bespoke_event_handling.rs index 8d0d40ff945..20c8b887ad9 100644 --- a/codex-rs/app-server/src/bespoke_event_handling.rs +++ b/codex-rs/app-server/src/bespoke_event_handling.rs @@ -405,6 +405,7 @@ pub(crate) async fn apply_bespoke_event_handling( thread_id: conversation_id.to_string(), role: "user".to_string(), text: event.delta, + update_kind: event.update_kind, }; outgoing .send_server_notification( @@ -417,6 +418,7 @@ pub(crate) async fn apply_bespoke_event_handling( thread_id: conversation_id.to_string(), role: "assistant".to_string(), text: event.delta, + update_kind: event.update_kind, }; outgoing .send_server_notification( diff --git a/codex-rs/app-server/src/codex_message_processor.rs b/codex-rs/app-server/src/codex_message_processor.rs index 3decc83f4a2..7ab7e42070c 100644 --- a/codex-rs/app-server/src/codex_message_processor.rs +++ b/codex-rs/app-server/src/codex_message_processor.rs @@ -7163,6 +7163,7 @@ impl CodexMessageProcessor { &request_id, thread.as_ref(), Op::RealtimeConversationStart(ConversationStartParams { + connection: params.connection, prompt: params.prompt, session_id: params.session_id, transport: params.transport.map(|transport| match transport { diff --git a/codex-rs/app-server/tests/suite/v2/experimental_api.rs b/codex-rs/app-server/tests/suite/v2/experimental_api.rs index 25a607390ec..4dc400984a1 100644 --- a/codex-rs/app-server/tests/suite/v2/experimental_api.rs +++ b/codex-rs/app-server/tests/suite/v2/experimental_api.rs @@ -17,6 +17,7 @@ use codex_app_server_protocol::ThreadRealtimeStartParams; use codex_app_server_protocol::ThreadRealtimeStartTransport; use codex_app_server_protocol::ThreadStartParams; use codex_app_server_protocol::ThreadStartResponse; +use codex_protocol::protocol::RealtimeConnection; use pretty_assertions::assert_eq; use std::path::Path; use std::time::Duration; @@ -76,6 +77,7 @@ async fn realtime_conversation_start_requires_experimental_api_capability() -> R let request_id = mcp .send_thread_realtime_start_request(ThreadRealtimeStartParams { thread_id: "thr_123".to_string(), + connection: RealtimeConnection::Audio, prompt: Some(Some("hello".to_string())), session_id: None, transport: None, @@ -145,6 +147,7 @@ async fn realtime_webrtc_start_requires_experimental_api_capability() -> Result< let request_id = mcp .send_thread_realtime_start_request(ThreadRealtimeStartParams { thread_id: "thr_123".to_string(), + connection: RealtimeConnection::Audio, prompt: Some(Some("hello".to_string())), session_id: None, transport: Some(ThreadRealtimeStartTransport::Webrtc { diff --git a/codex-rs/app-server/tests/suite/v2/realtime_conversation.rs b/codex-rs/app-server/tests/suite/v2/realtime_conversation.rs index f4c0f99ae31..13b3f339e86 100644 --- a/codex-rs/app-server/tests/suite/v2/realtime_conversation.rs +++ b/codex-rs/app-server/tests/suite/v2/realtime_conversation.rs @@ -38,7 +38,9 @@ use codex_app_server_protocol::TurnCompletedNotification; use codex_app_server_protocol::TurnStartedNotification; use codex_features::FEATURES; use codex_features::Feature; +use codex_protocol::protocol::RealtimeConnection; use codex_protocol::protocol::RealtimeConversationVersion; +use codex_protocol::protocol::RealtimeTranscriptUpdateKind; use codex_protocol::protocol::RealtimeVoice; use codex_protocol::protocol::RealtimeVoicesList; use core_test_support::responses; @@ -301,6 +303,7 @@ impl RealtimeE2eHarness { .mcp .send_thread_realtime_start_request(ThreadRealtimeStartParams { thread_id: self.thread_id.clone(), + connection: RealtimeConnection::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: Some(ThreadRealtimeStartTransport::Webrtc { @@ -478,6 +481,10 @@ async fn realtime_conversation_streams_v2_notifications() -> Result<()> { "type": "response.output_text.delta", "delta": "working" }), + json!({ + "type": "response.output_text.done", + "text": "working on it" + }), json!({ "type": "conversation.item.done", "item": { @@ -523,6 +530,7 @@ async fn realtime_conversation_streams_v2_notifications() -> Result<()> { let start_request_id = mcp .send_thread_realtime_start_request(ThreadRealtimeStartParams { thread_id: thread_start.thread.id.clone(), + connection: RealtimeConnection::Audio, prompt: None, session_id: None, transport: None, @@ -554,6 +562,10 @@ async fn realtime_conversation_streams_v2_notifications() -> Result<()> { startup_context_request.body_json()["session"]["audio"]["output"]["voice"], "cedar" ); + assert_eq!( + startup_context_request.body_json()["session"]["output_modalities"], + json!(["audio"]) + ); let startup_context_instructions = startup_context_request.body_json()["session"]["instructions"] .as_str() @@ -620,6 +632,10 @@ async fn realtime_conversation_streams_v2_notifications() -> Result<()> { assert_eq!(first_transcript_update.thread_id, output_audio.thread_id); assert_eq!(first_transcript_update.role, "user"); assert_eq!(first_transcript_update.text, "delegate now"); + assert_eq!( + first_transcript_update.update_kind, + RealtimeTranscriptUpdateKind::Delta + ); let second_transcript_update = read_notification::( @@ -630,6 +646,23 @@ async fn realtime_conversation_streams_v2_notifications() -> Result<()> { assert_eq!(second_transcript_update.thread_id, output_audio.thread_id); assert_eq!(second_transcript_update.role, "assistant"); assert_eq!(second_transcript_update.text, "working"); + assert_eq!( + second_transcript_update.update_kind, + RealtimeTranscriptUpdateKind::Delta + ); + + let final_transcript_update = read_notification::( + &mut mcp, + "thread/realtime/transcriptUpdated", + ) + .await?; + assert_eq!(final_transcript_update.thread_id, output_audio.thread_id); + assert_eq!(final_transcript_update.role, "assistant"); + assert_eq!(final_transcript_update.text, "working on it"); + assert_eq!( + final_transcript_update.update_kind, + RealtimeTranscriptUpdateKind::Done + ); let handoff_item_added = read_notification::( &mut mcp, @@ -693,6 +726,122 @@ async fn realtime_conversation_streams_v2_notifications() -> Result<()> { Ok(()) } +#[tokio::test] +async fn realtime_text_connection_requests_text_output_and_final_transcript() -> Result<()> { + skip_if_no_network!(Ok(())); + + let responses_server = create_mock_responses_server_sequence_unchecked(Vec::new()).await; + let realtime_server = start_websocket_server(vec![vec![vec![ + json!({ + "type": "session.updated", + "session": { "id": "sess_text", "instructions": "backend prompt" } + }), + json!({ + "type": "response.output_text.delta", + "delta": "hel" + }), + json!({ + "type": "response.output_text.delta", + "delta": "lo" + }), + json!({ + "type": "response.output_text.done", + "text": "hello" + }), + ]]]) + .await; + + let codex_home = TempDir::new()?; + create_config_toml( + codex_home.path(), + &responses_server.uri(), + realtime_server.uri(), + /*realtime_enabled*/ true, + StartupContextConfig::Generated, + )?; + + let mut mcp = McpProcess::new(codex_home.path()).await?; + mcp.initialize().await?; + login_with_api_key(&mut mcp, "sk-test-key").await?; + + let thread_start_request_id = mcp + .send_thread_start_request(ThreadStartParams::default()) + .await?; + let thread_start_response: JSONRPCResponse = timeout( + DEFAULT_TIMEOUT, + mcp.read_stream_until_response_message(RequestId::Integer(thread_start_request_id)), + ) + .await??; + let thread_start: ThreadStartResponse = to_response(thread_start_response)?; + + let start_request_id = mcp + .send_thread_realtime_start_request(ThreadRealtimeStartParams { + thread_id: thread_start.thread.id.clone(), + connection: RealtimeConnection::Text, + prompt: None, + session_id: None, + transport: None, + voice: None, + }) + .await?; + let start_response: JSONRPCResponse = timeout( + DEFAULT_TIMEOUT, + mcp.read_stream_until_response_message(RequestId::Integer(start_request_id)), + ) + .await??; + let _: ThreadRealtimeStartResponse = to_response(start_response)?; + + let session_update = realtime_server + .wait_for_request(/*connection_index*/ 0, /*request_index*/ 0) + .await; + assert_eq!( + session_update.body_json()["session"]["output_modalities"], + json!(["text"]) + ); + + let first_delta = read_notification::( + &mut mcp, + "thread/realtime/transcriptUpdated", + ) + .await?; + let second_delta = read_notification::( + &mut mcp, + "thread/realtime/transcriptUpdated", + ) + .await?; + let done = read_notification::( + &mut mcp, + "thread/realtime/transcriptUpdated", + ) + .await?; + assert_eq!( + vec![first_delta, second_delta, done], + vec![ + ThreadRealtimeTranscriptUpdatedNotification { + thread_id: thread_start.thread.id.clone(), + role: "assistant".to_string(), + text: "hel".to_string(), + update_kind: RealtimeTranscriptUpdateKind::Delta, + }, + ThreadRealtimeTranscriptUpdatedNotification { + thread_id: thread_start.thread.id.clone(), + role: "assistant".to_string(), + text: "lo".to_string(), + update_kind: RealtimeTranscriptUpdateKind::Delta, + }, + ThreadRealtimeTranscriptUpdatedNotification { + thread_id: thread_start.thread.id, + role: "assistant".to_string(), + text: "hello".to_string(), + update_kind: RealtimeTranscriptUpdateKind::Done, + }, + ] + ); + + realtime_server.shutdown().await; + Ok(()) +} + #[tokio::test] async fn realtime_list_voices_returns_supported_names() -> Result<()> { let codex_home = TempDir::new()?; @@ -793,6 +942,7 @@ async fn realtime_conversation_stop_emits_closed_notification() -> Result<()> { let start_request_id = mcp .send_thread_realtime_start_request(ThreadRealtimeStartParams { thread_id: thread_start.thread.id.clone(), + connection: RealtimeConnection::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, @@ -889,6 +1039,7 @@ async fn realtime_webrtc_start_emits_sdp_notification() -> Result<()> { let start_request_id = mcp .send_thread_realtime_start_request(ThreadRealtimeStartParams { thread_id: thread_id.clone(), + connection: RealtimeConnection::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: Some(ThreadRealtimeStartTransport::Webrtc { @@ -1168,6 +1319,7 @@ async fn webrtc_v2_forwards_audio_and_text_between_client_and_sideband() -> Resu ) .await?; assert_eq!(transcript.text, "transcribed audio"); + assert_eq!(transcript.update_kind, RealtimeTranscriptUpdateKind::Delta); let output_audio = harness .read_notification::( "thread/realtime/outputAudio/delta", @@ -1257,6 +1409,7 @@ async fn webrtc_v2_text_input_is_append_only_while_response_is_active() -> Resul ) .await?; assert_eq!(transcript.text, "active response started"); + assert_eq!(transcript.update_kind, RealtimeTranscriptUpdateKind::Delta); // Phase 3: send a second text turn while `resp_active` is still open. The // user message must reach realtime without requesting another response. @@ -1736,6 +1889,7 @@ async fn realtime_webrtc_start_surfaces_backend_error() -> Result<()> { let start_request_id = mcp .send_thread_realtime_start_request(ThreadRealtimeStartParams { thread_id: thread_start.thread.id, + connection: RealtimeConnection::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: Some(ThreadRealtimeStartTransport::Webrtc { @@ -1794,6 +1948,7 @@ async fn realtime_conversation_requires_feature_flag() -> Result<()> { let start_request_id = mcp .send_thread_realtime_start_request(ThreadRealtimeStartParams { thread_id: thread_start.thread.id.clone(), + connection: RealtimeConnection::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, diff --git a/codex-rs/codex-api/src/endpoint/mod.rs b/codex-rs/codex-api/src/endpoint/mod.rs index 4a208317a9d..815c3fbba15 100644 --- a/codex-rs/codex-api/src/endpoint/mod.rs +++ b/codex-rs/codex-api/src/endpoint/mod.rs @@ -12,9 +12,11 @@ pub use memories::MemoriesClient; pub use models::ModelsClient; pub use realtime_call::RealtimeCallClient; pub use realtime_call::RealtimeCallResponse; +pub use realtime_websocket::RealtimeConnection; pub use realtime_websocket::RealtimeEventParser; pub use realtime_websocket::RealtimeSessionConfig; pub use realtime_websocket::RealtimeSessionMode; +pub use realtime_websocket::RealtimeTranscriptUpdateKind; pub use realtime_websocket::RealtimeWebsocketClient; pub use realtime_websocket::RealtimeWebsocketConnection; pub use realtime_websocket::RealtimeWebsocketEvents; diff --git a/codex-rs/codex-api/src/endpoint/realtime_call.rs b/codex-rs/codex-api/src/endpoint/realtime_call.rs index 8a68d088c7c..1900c3443ee 100644 --- a/codex-rs/codex-api/src/endpoint/realtime_call.rs +++ b/codex-rs/codex-api/src/endpoint/realtime_call.rs @@ -218,6 +218,7 @@ fn decode_call_id_from_location(headers: &HeaderMap) -> Result #[cfg(test)] mod tests { use super::*; + use crate::endpoint::realtime_websocket::RealtimeConnection; use crate::endpoint::realtime_websocket::RealtimeEventParser; use crate::endpoint::realtime_websocket::RealtimeSessionMode; use crate::provider::RetryConfig; @@ -309,6 +310,7 @@ mod tests { session_id: Some(session_id.to_string()), event_parser: RealtimeEventParser::RealtimeV2, session_mode: RealtimeSessionMode::Conversational, + connection: RealtimeConnection::Audio, voice: RealtimeVoice::Marin, } } diff --git a/codex-rs/codex-api/src/endpoint/realtime_websocket/methods.rs b/codex-rs/codex-api/src/endpoint/realtime_websocket/methods.rs index a2681f4969b..f4d3f0648ee 100644 --- a/codex-rs/codex-api/src/endpoint/realtime_websocket/methods.rs +++ b/codex-rs/codex-api/src/endpoint/realtime_websocket/methods.rs @@ -4,6 +4,7 @@ use crate::endpoint::realtime_websocket::methods_common::normalized_session_mode use crate::endpoint::realtime_websocket::methods_common::session_update_session; use crate::endpoint::realtime_websocket::methods_common::websocket_intent; use crate::endpoint::realtime_websocket::protocol::RealtimeAudioFrame; +use crate::endpoint::realtime_websocket::protocol::RealtimeConnection; use crate::endpoint::realtime_websocket::protocol::RealtimeEvent; use crate::endpoint::realtime_websocket::protocol::RealtimeEventParser; use crate::endpoint::realtime_websocket::protocol::RealtimeOutboundMessage; @@ -11,6 +12,7 @@ use crate::endpoint::realtime_websocket::protocol::RealtimeSessionConfig; use crate::endpoint::realtime_websocket::protocol::RealtimeSessionMode; use crate::endpoint::realtime_websocket::protocol::RealtimeTranscriptDelta; use crate::endpoint::realtime_websocket::protocol::RealtimeTranscriptEntry; +use crate::endpoint::realtime_websocket::protocol::RealtimeTranscriptUpdateKind; use crate::endpoint::realtime_websocket::protocol::RealtimeVoice; use crate::endpoint::realtime_websocket::protocol::parse_realtime_event; use crate::error::ApiError; @@ -307,10 +309,17 @@ impl RealtimeWebsocketWriter { &self, instructions: String, session_mode: RealtimeSessionMode, + connection: RealtimeConnection, voice: RealtimeVoice, ) -> Result<(), ApiError> { let session_mode = normalized_session_mode(self.event_parser, session_mode); - let session = session_update_session(self.event_parser, instructions, session_mode, voice); + let session = session_update_session( + self.event_parser, + instructions, + session_mode, + connection, + voice, + ); self.send_json(&RealtimeOutboundMessage::SessionUpdate { session }) .await } @@ -406,11 +415,11 @@ impl RealtimeWebsocketEvents { let mut active_transcript = self.active_transcript.lock().await; match event { RealtimeEvent::InputAudioSpeechStarted(_) => {} - RealtimeEvent::InputTranscriptDelta(RealtimeTranscriptDelta { delta }) => { - append_transcript_delta(&mut active_transcript.entries, "user", delta); + RealtimeEvent::InputTranscriptDelta(update) => { + update_active_transcript_entry(&mut active_transcript.entries, "user", update); } - RealtimeEvent::OutputTranscriptDelta(RealtimeTranscriptDelta { delta }) => { - append_transcript_delta(&mut active_transcript.entries, "assistant", delta); + RealtimeEvent::OutputTranscriptDelta(update) => { + update_active_transcript_entry(&mut active_transcript.entries, "assistant", update); } RealtimeEvent::HandoffRequested(handoff) => { if self.event_parser == RealtimeEventParser::V1 { @@ -447,6 +456,41 @@ fn append_transcript_delta(entries: &mut Vec, role: &st }); } +fn update_active_transcript_entry( + entries: &mut Vec, + role: &str, + update: &RealtimeTranscriptDelta, +) { + match update.update_kind { + RealtimeTranscriptUpdateKind::Delta => { + append_transcript_delta(entries, role, &update.delta) + } + RealtimeTranscriptUpdateKind::Done => { + complete_transcript_entry(entries, role, &update.delta) + } + } +} + +fn complete_transcript_entry(entries: &mut Vec, role: &str, text: &str) { + if text.is_empty() { + return; + } + + // Final transcript events carry the complete part. Replace the in-progress entry + // instead of appending, so active transcript state stays usable after deltas. + if let Some(last_entry) = entries.last_mut() + && last_entry.role == role + { + last_entry.text = text.to_string(); + return; + } + + entries.push(RealtimeTranscriptEntry { + role: role.to_string(), + text: text.to_string(), + }); +} + pub struct RealtimeWebsocketClient { provider: Provider, } @@ -581,7 +625,12 @@ impl RealtimeWebsocketClient { ); connection .writer - .send_session_update(config.instructions, config.session_mode, config.voice) + .send_session_update( + config.instructions, + config.session_mode, + config.connection, + config.voice, + ) .await?; Ok(connection) } @@ -842,6 +891,7 @@ mod tests { Some(RealtimeEvent::InputTranscriptDelta( RealtimeTranscriptDelta { delta: "hello ".to_string(), + update_kind: RealtimeTranscriptUpdateKind::Delta, } )) ); @@ -860,6 +910,7 @@ mod tests { Some(RealtimeEvent::OutputTranscriptDelta( RealtimeTranscriptDelta { delta: "hi".to_string(), + update_kind: RealtimeTranscriptUpdateKind::Delta, } )) ); @@ -903,6 +954,26 @@ mod tests { Some(RealtimeEvent::InputTranscriptDelta( RealtimeTranscriptDelta { delta: "hello".to_string(), + update_kind: RealtimeTranscriptUpdateKind::Delta, + } + )) + ); + } + + #[test] + fn parse_realtime_v2_output_text_done_event() { + let payload = json!({ + "type": "response.output_text.done", + "text": "all done" + }) + .to_string(); + + assert_eq!( + parse_realtime_event(payload.as_str(), RealtimeEventParser::RealtimeV2), + Some(RealtimeEvent::OutputTranscriptDelta( + RealtimeTranscriptDelta { + delta: "all done".to_string(), + update_kind: RealtimeTranscriptUpdateKind::Done, } )) ); @@ -1374,6 +1445,7 @@ mod tests { session_id: Some("conv_1".to_string()), event_parser: RealtimeEventParser::V1, session_mode: RealtimeSessionMode::Conversational, + connection: RealtimeConnection::Audio, voice: RealtimeVoice::Breeze, }, HeaderMap::new(), @@ -1442,6 +1514,7 @@ mod tests { input_delta_event, RealtimeEvent::InputTranscriptDelta(RealtimeTranscriptDelta { delta: "delegate ".to_string(), + update_kind: RealtimeTranscriptUpdateKind::Delta, }) ); @@ -1454,6 +1527,7 @@ mod tests { input_delta_event, RealtimeEvent::InputTranscriptDelta(RealtimeTranscriptDelta { delta: "now".to_string(), + update_kind: RealtimeTranscriptUpdateKind::Delta, }) ); @@ -1466,6 +1540,7 @@ mod tests { output_delta_event, RealtimeEvent::OutputTranscriptDelta(RealtimeTranscriptDelta { delta: "working".to_string(), + update_kind: RealtimeTranscriptUpdateKind::Delta, }) ); @@ -1648,6 +1723,7 @@ mod tests { session_id: Some("conv_1".to_string()), event_parser: RealtimeEventParser::RealtimeV2, session_mode: RealtimeSessionMode::Conversational, + connection: RealtimeConnection::Audio, voice: RealtimeVoice::Cedar, }, HeaderMap::new(), @@ -1753,6 +1829,7 @@ mod tests { session_id: Some("conv_1".to_string()), event_parser: RealtimeEventParser::RealtimeV2, session_mode: RealtimeSessionMode::Transcription, + connection: RealtimeConnection::Audio, voice: RealtimeVoice::Marin, }, HeaderMap::new(), @@ -1856,6 +1933,7 @@ mod tests { session_id: Some("conv_1".to_string()), event_parser: RealtimeEventParser::V1, session_mode: RealtimeSessionMode::Transcription, + connection: RealtimeConnection::Audio, voice: RealtimeVoice::Cove, }, HeaderMap::new(), @@ -1945,6 +2023,7 @@ mod tests { session_id: Some("conv_1".to_string()), event_parser: RealtimeEventParser::V1, session_mode: RealtimeSessionMode::Conversational, + connection: RealtimeConnection::Audio, voice: RealtimeVoice::Cove, }, HeaderMap::new(), diff --git a/codex-rs/codex-api/src/endpoint/realtime_websocket/methods_common.rs b/codex-rs/codex-api/src/endpoint/realtime_websocket/methods_common.rs index 8eb079fe83e..fe4ca8d2cd8 100644 --- a/codex-rs/codex-api/src/endpoint/realtime_websocket/methods_common.rs +++ b/codex-rs/codex-api/src/endpoint/realtime_websocket/methods_common.rs @@ -6,6 +6,7 @@ use crate::endpoint::realtime_websocket::methods_v2::conversation_handoff_append use crate::endpoint::realtime_websocket::methods_v2::conversation_item_create_message as v2_conversation_item_create_message; use crate::endpoint::realtime_websocket::methods_v2::session_update_session as v2_session_update_session; use crate::endpoint::realtime_websocket::methods_v2::websocket_intent as v2_websocket_intent; +use crate::endpoint::realtime_websocket::protocol::RealtimeConnection; use crate::endpoint::realtime_websocket::protocol::RealtimeEventParser; use crate::endpoint::realtime_websocket::protocol::RealtimeOutboundMessage; use crate::endpoint::realtime_websocket::protocol::RealtimeSessionConfig; @@ -57,13 +58,14 @@ pub(super) fn session_update_session( event_parser: RealtimeEventParser, instructions: String, session_mode: RealtimeSessionMode, + connection: RealtimeConnection, voice: RealtimeVoice, ) -> SessionUpdateSession { let session_mode = normalized_session_mode(event_parser, session_mode); match event_parser { RealtimeEventParser::V1 => v1_session_update_session(instructions, voice), RealtimeEventParser::RealtimeV2 => { - v2_session_update_session(instructions, session_mode, voice) + v2_session_update_session(instructions, session_mode, connection, voice) } } } @@ -73,6 +75,7 @@ pub fn session_update_session_json(config: RealtimeSessionConfig) -> JsonResult< config.event_parser, config.instructions, config.session_mode, + config.connection, config.voice, ); session.id = config.session_id; diff --git a/codex-rs/codex-api/src/endpoint/realtime_websocket/methods_v2.rs b/codex-rs/codex-api/src/endpoint/realtime_websocket/methods_v2.rs index cfca6fce613..4b2f0323c8d 100644 --- a/codex-rs/codex-api/src/endpoint/realtime_websocket/methods_v2.rs +++ b/codex-rs/codex-api/src/endpoint/realtime_websocket/methods_v2.rs @@ -8,6 +8,7 @@ use crate::endpoint::realtime_websocket::protocol::ConversationItemType; use crate::endpoint::realtime_websocket::protocol::ConversationMessageItem; use crate::endpoint::realtime_websocket::protocol::ConversationRole; use crate::endpoint::realtime_websocket::protocol::NoiseReductionType; +use crate::endpoint::realtime_websocket::protocol::RealtimeConnection; use crate::endpoint::realtime_websocket::protocol::RealtimeOutboundMessage; use crate::endpoint::realtime_websocket::protocol::RealtimeSessionMode; use crate::endpoint::realtime_websocket::protocol::RealtimeVoice; @@ -26,6 +27,7 @@ use crate::endpoint::realtime_websocket::protocol::TurnDetectionType; use serde_json::json; const REALTIME_V2_OUTPUT_MODALITY_AUDIO: &str = "audio"; +const REALTIME_V2_OUTPUT_MODALITY_TEXT: &str = "text"; const REALTIME_V2_TOOL_CHOICE: &str = "auto"; const REALTIME_V2_BACKGROUND_AGENT_TOOL_NAME: &str = "background_agent"; const REALTIME_V2_BACKGROUND_AGENT_TOOL_DESCRIPTION: &str = "Send a user request to the background agent. Use this as the default action. If the background agent is idle, this starts a new task and returns the final result to the user. If the background agent is already working on a task, this sends the request as guidance to steer that previous task. If the user asks to do something next, later, after this, or once current work finishes, call this tool so the work is actually queued instead of merely promising to do it later."; @@ -59,6 +61,7 @@ pub(super) fn conversation_handoff_append_message( pub(super) fn session_update_session( instructions: String, session_mode: RealtimeSessionMode, + connection: RealtimeConnection, voice: RealtimeVoice, ) -> SessionUpdateSession { match session_mode { @@ -67,7 +70,7 @@ pub(super) fn session_update_session( r#type: SessionType::Realtime, model: None, instructions: Some(instructions), - output_modalities: Some(vec![REALTIME_V2_OUTPUT_MODALITY_AUDIO.to_string()]), + output_modalities: Some(vec![output_modality(connection).to_string()]), audio: SessionAudio { input: SessionAudioInput { format: SessionAudioFormat { @@ -132,6 +135,13 @@ pub(super) fn session_update_session( } } +fn output_modality(connection: RealtimeConnection) -> &'static str { + match connection { + RealtimeConnection::Text => REALTIME_V2_OUTPUT_MODALITY_TEXT, + RealtimeConnection::Audio => REALTIME_V2_OUTPUT_MODALITY_AUDIO, + } +} + pub(super) fn websocket_intent() -> Option<&'static str> { None } diff --git a/codex-rs/codex-api/src/endpoint/realtime_websocket/mod.rs b/codex-rs/codex-api/src/endpoint/realtime_websocket/mod.rs index 4031e012860..c93f6cf2efa 100644 --- a/codex-rs/codex-api/src/endpoint/realtime_websocket/mod.rs +++ b/codex-rs/codex-api/src/endpoint/realtime_websocket/mod.rs @@ -12,6 +12,8 @@ pub use methods::RealtimeWebsocketConnection; pub use methods::RealtimeWebsocketEvents; pub use methods::RealtimeWebsocketWriter; pub use methods_common::session_update_session_json; +pub use protocol::RealtimeConnection; pub use protocol::RealtimeEventParser; pub use protocol::RealtimeSessionConfig; pub use protocol::RealtimeSessionMode; +pub use protocol::RealtimeTranscriptUpdateKind; diff --git a/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol.rs b/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol.rs index 0185984c613..1aa7e1473a3 100644 --- a/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol.rs +++ b/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol.rs @@ -1,9 +1,11 @@ use crate::endpoint::realtime_websocket::protocol_v1::parse_realtime_event_v1; use crate::endpoint::realtime_websocket::protocol_v2::parse_realtime_event_v2; pub use codex_protocol::protocol::RealtimeAudioFrame; +pub use codex_protocol::protocol::RealtimeConnection; pub use codex_protocol::protocol::RealtimeEvent; pub use codex_protocol::protocol::RealtimeTranscriptDelta; pub use codex_protocol::protocol::RealtimeTranscriptEntry; +pub use codex_protocol::protocol::RealtimeTranscriptUpdateKind; pub use codex_protocol::protocol::RealtimeVoice; use serde::Serialize; use serde_json::Value; @@ -27,6 +29,7 @@ pub struct RealtimeSessionConfig { pub session_id: Option, pub event_parser: RealtimeEventParser, pub session_mode: RealtimeSessionMode, + pub connection: RealtimeConnection, pub voice: RealtimeVoice, } diff --git a/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol_common.rs b/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol_common.rs index dbd8544d94f..58639ee21f1 100644 --- a/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol_common.rs +++ b/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol_common.rs @@ -1,5 +1,6 @@ use codex_protocol::protocol::RealtimeEvent; use codex_protocol::protocol::RealtimeTranscriptDelta; +use codex_protocol::protocol::RealtimeTranscriptUpdateKind; use serde_json::Value; use tracing::debug; @@ -45,12 +46,20 @@ pub(super) fn parse_session_updated_event(parsed: &Value) -> Option Option { + parse_transcript_update_event(parsed, field, RealtimeTranscriptUpdateKind::Delta) +} + +pub(super) fn parse_transcript_update_event( + parsed: &Value, + field: &str, + update_kind: RealtimeTranscriptUpdateKind, ) -> Option { parsed .get(field) .and_then(Value::as_str) .map(str::to_string) - .map(|delta| RealtimeTranscriptDelta { delta }) + .map(|delta| RealtimeTranscriptDelta { delta, update_kind }) } pub(super) fn parse_error_event(parsed: &Value) -> Option { diff --git a/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol_v2.rs b/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol_v2.rs index 4c2c909e802..d91d6c1d2fb 100644 --- a/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol_v2.rs +++ b/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol_v2.rs @@ -2,6 +2,7 @@ use crate::endpoint::realtime_websocket::protocol_common::parse_error_event; use crate::endpoint::realtime_websocket::protocol_common::parse_realtime_payload; use crate::endpoint::realtime_websocket::protocol_common::parse_session_updated_event; use crate::endpoint::realtime_websocket::protocol_common::parse_transcript_delta_event; +use crate::endpoint::realtime_websocket::protocol_common::parse_transcript_update_event; use codex_protocol::protocol::RealtimeAudioFrame; use codex_protocol::protocol::RealtimeEvent; use codex_protocol::protocol::RealtimeHandoffRequested; @@ -9,6 +10,7 @@ use codex_protocol::protocol::RealtimeInputAudioSpeechStarted; use codex_protocol::protocol::RealtimeResponseCancelled; use codex_protocol::protocol::RealtimeResponseCreated; use codex_protocol::protocol::RealtimeResponseDone; +use codex_protocol::protocol::RealtimeTranscriptUpdateKind; use serde_json::Map as JsonMap; use serde_json::Value; use tracing::debug; @@ -30,12 +32,20 @@ pub(super) fn parse_realtime_event_v2(payload: &str) -> Option { parse_transcript_delta_event(&parsed, "delta").map(RealtimeEvent::InputTranscriptDelta) } "conversation.item.input_audio_transcription.completed" => { - parse_transcript_delta_event(&parsed, "transcript") + parse_transcript_update_event(&parsed, "transcript", RealtimeTranscriptUpdateKind::Done) .map(RealtimeEvent::InputTranscriptDelta) } "response.output_text.delta" | "response.output_audio_transcript.delta" => { parse_transcript_delta_event(&parsed, "delta").map(RealtimeEvent::OutputTranscriptDelta) } + "response.output_text.done" => { + parse_transcript_update_event(&parsed, "text", RealtimeTranscriptUpdateKind::Done) + .map(RealtimeEvent::OutputTranscriptDelta) + } + "response.output_audio_transcript.done" => { + parse_transcript_update_event(&parsed, "transcript", RealtimeTranscriptUpdateKind::Done) + .map(RealtimeEvent::OutputTranscriptDelta) + } "input_audio_buffer.speech_started" => Some(RealtimeEvent::InputAudioSpeechStarted( RealtimeInputAudioSpeechStarted { item_id: parsed diff --git a/codex-rs/codex-api/src/lib.rs b/codex-rs/codex-api/src/lib.rs index ac26d3cdba7..bd592bfceed 100644 --- a/codex-rs/codex-api/src/lib.rs +++ b/codex-rs/codex-api/src/lib.rs @@ -40,9 +40,11 @@ pub use crate::endpoint::MemoriesClient; pub use crate::endpoint::ModelsClient; pub use crate::endpoint::RealtimeCallClient; pub use crate::endpoint::RealtimeCallResponse; +pub use crate::endpoint::RealtimeConnection; pub use crate::endpoint::RealtimeEventParser; pub use crate::endpoint::RealtimeSessionConfig; pub use crate::endpoint::RealtimeSessionMode; +pub use crate::endpoint::RealtimeTranscriptUpdateKind; pub use crate::endpoint::RealtimeWebsocketClient; pub use crate::endpoint::RealtimeWebsocketConnection; pub use crate::endpoint::RealtimeWebsocketEvents; diff --git a/codex-rs/codex-api/tests/realtime_websocket_e2e.rs b/codex-rs/codex-api/tests/realtime_websocket_e2e.rs index 9969a96f097..c03743d776f 100644 --- a/codex-rs/codex-api/tests/realtime_websocket_e2e.rs +++ b/codex-rs/codex-api/tests/realtime_websocket_e2e.rs @@ -4,6 +4,7 @@ use std::time::Duration; use codex_api::Provider; use codex_api::RealtimeAudioFrame; +use codex_api::RealtimeConnection; use codex_api::RealtimeEvent; use codex_api::RealtimeEventParser; use codex_api::RealtimeSessionConfig; @@ -145,6 +146,7 @@ async fn realtime_ws_e2e_session_create_and_event_flow() { session_id: Some("conv_123".to_string()), event_parser: RealtimeEventParser::V1, session_mode: RealtimeSessionMode::Conversational, + connection: RealtimeConnection::Audio, voice: RealtimeVoice::Cove, }, HeaderMap::new(), @@ -248,6 +250,7 @@ async fn realtime_ws_connect_webrtc_sideband_retries_join_until_server_is_availa session_id: Some("conv_123".to_string()), event_parser: RealtimeEventParser::RealtimeV2, session_mode: RealtimeSessionMode::Conversational, + connection: RealtimeConnection::Audio, voice: RealtimeVoice::Marin, }, "rtc_test", @@ -319,6 +322,7 @@ async fn realtime_ws_e2e_send_while_next_event_waits() { session_id: Some("conv_123".to_string()), event_parser: RealtimeEventParser::V1, session_mode: RealtimeSessionMode::Conversational, + connection: RealtimeConnection::Audio, voice: RealtimeVoice::Cove, }, HeaderMap::new(), @@ -386,6 +390,7 @@ async fn realtime_ws_e2e_disconnected_emitted_once() { session_id: Some("conv_123".to_string()), event_parser: RealtimeEventParser::V1, session_mode: RealtimeSessionMode::Conversational, + connection: RealtimeConnection::Audio, voice: RealtimeVoice::Cove, }, HeaderMap::new(), @@ -449,6 +454,7 @@ async fn realtime_ws_e2e_ignores_unknown_text_events() { session_id: Some("conv_123".to_string()), event_parser: RealtimeEventParser::V1, session_mode: RealtimeSessionMode::Conversational, + connection: RealtimeConnection::Audio, voice: RealtimeVoice::Cove, }, HeaderMap::new(), @@ -515,6 +521,7 @@ async fn realtime_ws_e2e_realtime_v2_parser_emits_handoff_requested() { session_id: Some("conv_123".to_string()), event_parser: RealtimeEventParser::RealtimeV2, session_mode: RealtimeSessionMode::Conversational, + connection: RealtimeConnection::Audio, voice: RealtimeVoice::Marin, }, HeaderMap::new(), diff --git a/codex-rs/core/src/realtime_conversation.rs b/codex-rs/core/src/realtime_conversation.rs index 8200ba4908a..f1291ef333a 100644 --- a/codex-rs/core/src/realtime_conversation.rs +++ b/codex-rs/core/src/realtime_conversation.rs @@ -37,6 +37,7 @@ use codex_protocol::protocol::ConversationTextParams; use codex_protocol::protocol::ErrorEvent; use codex_protocol::protocol::Event; use codex_protocol::protocol::EventMsg; +use codex_protocol::protocol::RealtimeConnection; use codex_protocol::protocol::RealtimeConversationClosedEvent; use codex_protocol::protocol::RealtimeConversationRealtimeEvent; use codex_protocol::protocol::RealtimeConversationSdpEvent; @@ -593,8 +594,14 @@ async fn prepare_realtime_start( api_provider.base_url = realtime_ws_base_url.clone(); } let version = config.realtime.version; - let session_config = - build_realtime_session_config(sess, params.prompt, params.session_id, params.voice).await?; + let session_config = build_realtime_session_config( + sess, + params.prompt, + params.session_id, + params.connection, + params.voice, + ) + .await?; let requested_session_id = session_config.session_id.clone(); let extra_headers = match transport { ConversationStartTransport::Websocket => { @@ -622,6 +629,7 @@ pub(crate) async fn build_realtime_session_config( sess: &Arc, prompt: Option>, session_id: Option, + connection: RealtimeConnection, voice: Option, ) -> CodexResult { let config = sess.get_config().await; @@ -653,6 +661,13 @@ pub(crate) async fn build_realtime_session_config( RealtimeWsVersion::V1 => RealtimeEventParser::V1, RealtimeWsVersion::V2 => RealtimeEventParser::RealtimeV2, }; + if config.realtime.version == RealtimeWsVersion::V1 + && matches!(connection, RealtimeConnection::Text) + { + return Err(CodexErr::InvalidRequest( + "text realtime connection requires realtime v2".to_string(), + )); + } let session_mode = match config.realtime.session_type { RealtimeWsMode::Conversational => RealtimeSessionMode::Conversational, RealtimeWsMode::Transcription => RealtimeSessionMode::Transcription, @@ -667,6 +682,7 @@ pub(crate) async fn build_realtime_session_config( session_id: Some(session_id.unwrap_or_else(|| sess.conversation_id.to_string())), event_parser, session_mode, + connection, voice, }) } diff --git a/codex-rs/core/tests/suite/compact_remote.rs b/codex-rs/core/tests/suite/compact_remote.rs index a2681423f30..9e32b4c1a9a 100644 --- a/codex-rs/core/tests/suite/compact_remote.rs +++ b/codex-rs/core/tests/suite/compact_remote.rs @@ -15,6 +15,7 @@ use codex_protocol::protocol::EventMsg; use codex_protocol::protocol::ItemCompletedEvent; use codex_protocol::protocol::ItemStartedEvent; use codex_protocol::protocol::Op; +use codex_protocol::protocol::RealtimeConnection; use codex_protocol::protocol::RealtimeConversationRealtimeEvent; use codex_protocol::protocol::RealtimeEvent; use codex_protocol::protocol::RolloutItem; @@ -116,6 +117,7 @@ async fn start_remote_realtime_server() -> responses::WebSocketTestServer { async fn start_realtime_conversation(codex: &codex_core::CodexThread) -> Result<()> { codex .submit(Op::RealtimeConversationStart(ConversationStartParams { + connection: RealtimeConnection::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, diff --git a/codex-rs/core/tests/suite/realtime_conversation.rs b/codex-rs/core/tests/suite/realtime_conversation.rs index cede36c28a0..fb59578586b 100644 --- a/codex-rs/core/tests/suite/realtime_conversation.rs +++ b/codex-rs/core/tests/suite/realtime_conversation.rs @@ -18,6 +18,7 @@ use codex_protocol::protocol::EventMsg; use codex_protocol::protocol::InitialHistory; use codex_protocol::protocol::Op; use codex_protocol::protocol::RealtimeAudioFrame; +use codex_protocol::protocol::RealtimeConnection; use codex_protocol::protocol::RealtimeConversationRealtimeEvent; use codex_protocol::protocol::RealtimeConversationVersion; use codex_protocol::protocol::RealtimeEvent; @@ -247,6 +248,7 @@ async fn conversation_start_audio_text_close_round_trip() -> Result<()> { test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { + connection: RealtimeConnection::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, @@ -380,6 +382,7 @@ async fn conversation_start_defaults_to_v2_and_gpt_realtime_1_5() -> Result<()> test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { + connection: RealtimeConnection::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, @@ -463,6 +466,7 @@ async fn conversation_webrtc_start_posts_generated_session() -> Result<()> { test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { + connection: RealtimeConnection::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: Some(ConversationStartTransport::Webrtc { @@ -600,6 +604,7 @@ async fn conversation_start_uses_openai_env_key_fallback_with_chatgpt_auth() -> test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { + connection: RealtimeConnection::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, @@ -661,6 +666,7 @@ async fn conversation_transport_close_emits_closed_event() -> Result<()> { test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { + connection: RealtimeConnection::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, @@ -746,6 +752,7 @@ async fn conversation_start_preflight_failure_emits_realtime_error_only() -> Res test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { + connection: RealtimeConnection::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, @@ -789,6 +796,7 @@ async fn conversation_start_connect_failure_emits_realtime_error_only() -> Resul test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { + connection: RealtimeConnection::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, @@ -879,6 +887,7 @@ async fn conversation_second_start_replaces_runtime() -> Result<()> { test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { + connection: RealtimeConnection::Audio, prompt: Some(Some("old".to_string())), session_id: Some("conv_old".to_string()), transport: None, @@ -897,6 +906,7 @@ async fn conversation_second_start_replaces_runtime() -> Result<()> { test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { + connection: RealtimeConnection::Audio, prompt: Some(Some("new".to_string())), session_id: Some("conv_new".to_string()), transport: None, @@ -986,6 +996,7 @@ async fn conversation_uses_experimental_realtime_ws_base_url_override() -> Resul test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { + connection: RealtimeConnection::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, @@ -1043,6 +1054,7 @@ async fn conversation_uses_default_realtime_backend_prompt() -> Result<()> { test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { + connection: RealtimeConnection::Audio, prompt: None, session_id: None, transport: None, @@ -1108,6 +1120,7 @@ async fn conversation_uses_empty_instructions_for_null_or_empty_prompt() -> Resu ] { test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { + connection: RealtimeConnection::Audio, prompt, session_id: None, transport: None, @@ -1166,6 +1179,7 @@ async fn conversation_uses_explicit_start_voice() -> Result<()> { test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { + connection: RealtimeConnection::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, @@ -1216,6 +1230,7 @@ async fn conversation_uses_configured_realtime_voice() -> Result<()> { test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { + connection: RealtimeConnection::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, @@ -1254,6 +1269,7 @@ async fn conversation_rejects_voice_for_wrong_realtime_version() -> Result<()> { test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { + connection: RealtimeConnection::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, @@ -1297,6 +1313,7 @@ async fn conversation_uses_experimental_realtime_ws_backend_prompt_override() -> test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { + connection: RealtimeConnection::Audio, prompt: Some(Some("prompt from op".to_string())), session_id: None, transport: None, @@ -1362,6 +1379,7 @@ async fn conversation_uses_experimental_realtime_ws_startup_context_override() - test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { + connection: RealtimeConnection::Audio, prompt: Some(Some("prompt from op".to_string())), session_id: None, transport: None, @@ -1425,6 +1443,7 @@ async fn conversation_disables_realtime_startup_context_with_empty_override() -> test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { + connection: RealtimeConnection::Audio, prompt: Some(Some("prompt from op".to_string())), session_id: None, transport: None, @@ -1481,6 +1500,7 @@ async fn conversation_start_injects_startup_context_from_thread_history() -> Res test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { + connection: RealtimeConnection::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, @@ -1592,6 +1612,7 @@ async fn conversation_startup_context_current_thread_selects_many_turns_by_budge codex .submit(Op::RealtimeConversationStart(ConversationStartParams { + connection: RealtimeConnection::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, @@ -1696,6 +1717,7 @@ async fn conversation_startup_context_falls_back_to_workspace_map() -> Result<() test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { + connection: RealtimeConnection::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, @@ -1750,6 +1772,7 @@ async fn conversation_startup_context_is_truncated_and_sent_once_per_start() -> test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { + connection: RealtimeConnection::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, @@ -1825,6 +1848,7 @@ async fn conversation_user_text_turn_is_sent_to_realtime_when_active() -> Result test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { + connection: RealtimeConnection::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, @@ -1957,6 +1981,7 @@ async fn conversation_mirrors_assistant_message_text_to_realtime_handoff() -> Re test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { + connection: RealtimeConnection::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, @@ -2086,6 +2111,7 @@ async fn conversation_handoff_persists_across_item_done_until_turn_complete() -> test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { + connection: RealtimeConnection::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, @@ -2230,6 +2256,7 @@ async fn inbound_handoff_request_starts_turn() -> Result<()> { test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { + connection: RealtimeConnection::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, @@ -2327,6 +2354,7 @@ async fn inbound_handoff_request_uses_active_transcript() -> Result<()> { test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { + connection: RealtimeConnection::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, @@ -2422,6 +2450,7 @@ async fn inbound_handoff_request_clears_active_transcript_after_each_handoff() - test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { + connection: RealtimeConnection::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, @@ -2524,6 +2553,7 @@ async fn inbound_conversation_item_does_not_start_turn_and_still_forwards_audio( test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { + connection: RealtimeConnection::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, @@ -2639,6 +2669,7 @@ async fn delegated_turn_user_role_echo_does_not_redelegate_and_still_forwards_au test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { + connection: RealtimeConnection::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, @@ -2784,6 +2815,7 @@ async fn inbound_handoff_request_does_not_block_realtime_event_forwarding() -> R test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { + connection: RealtimeConnection::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, @@ -2914,6 +2946,7 @@ async fn inbound_handoff_request_steers_active_turn() -> Result<()> { test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { + connection: RealtimeConnection::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, @@ -3065,6 +3098,7 @@ async fn inbound_handoff_request_starts_turn_and_does_not_block_realtime_audio() test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { + connection: RealtimeConnection::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, diff --git a/codex-rs/protocol/src/protocol.rs b/codex-rs/protocol/src/protocol.rs index 1fc707469e0..ba56cc35084 100644 --- a/codex-rs/protocol/src/protocol.rs +++ b/codex-rs/protocol/src/protocol.rs @@ -134,6 +134,8 @@ pub struct McpServerRefreshConfig { #[derive(Debug, Clone, Deserialize, Serialize, PartialEq, JsonSchema, TS)] pub struct ConversationStartParams { + /// Selects whether the realtime session should produce text or audio output. + pub connection: RealtimeConnection, #[serde( default, deserialize_with = "conversation_start_prompt_serde::deserialize", @@ -157,6 +159,13 @@ pub enum ConversationStartTransport { Webrtc { sdp: String }, } +#[derive(Debug, Clone, Copy, Deserialize, Serialize, PartialEq, Eq, JsonSchema, TS)] +#[serde(rename_all = "snake_case")] +pub enum RealtimeConnection { + Text, + Audio, +} + mod conversation_start_prompt_serde { use serde::Deserializer; use serde::Serializer; @@ -285,9 +294,17 @@ pub struct RealtimeAudioFrame { pub item_id: Option, } +#[derive(Debug, Clone, Copy, Deserialize, Serialize, PartialEq, Eq, JsonSchema, TS)] +#[serde(rename_all = "snake_case")] +pub enum RealtimeTranscriptUpdateKind { + Delta, + Done, +} + #[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq, JsonSchema, TS)] pub struct RealtimeTranscriptDelta { pub delta: String, + pub update_kind: RealtimeTranscriptUpdateKind, } #[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq, JsonSchema, TS)] @@ -4586,12 +4603,14 @@ mod tests { }, }); let start = Op::RealtimeConversationStart(ConversationStartParams { + connection: RealtimeConnection::Audio, prompt: Some(Some("be helpful".to_string())), session_id: Some("conv_1".to_string()), transport: None, voice: None, }); let webrtc_start = Op::RealtimeConversationStart(ConversationStartParams { + connection: RealtimeConnection::Audio, prompt: Some(Some("be helpful".to_string())), session_id: Some("conv_1".to_string()), transport: Some(ConversationStartTransport::Webrtc { @@ -4604,12 +4623,14 @@ mod tests { }); let close = Op::RealtimeConversationClose; let default_prompt_start = Op::RealtimeConversationStart(ConversationStartParams { + connection: RealtimeConnection::Audio, prompt: None, session_id: None, transport: None, voice: None, }); let null_prompt_start = Op::RealtimeConversationStart(ConversationStartParams { + connection: RealtimeConnection::Audio, prompt: Some(None), session_id: None, transport: None, @@ -4621,6 +4642,7 @@ mod tests { serde_json::to_value(&start).unwrap(), json!({ "type": "realtime_conversation_start", + "connection": "audio", "prompt": "be helpful", "session_id": "conv_1" }) @@ -4628,19 +4650,22 @@ mod tests { assert_eq!( serde_json::to_value(&default_prompt_start).unwrap(), json!({ - "type": "realtime_conversation_start" + "type": "realtime_conversation_start", + "connection": "audio" }) ); assert_eq!( serde_json::to_value(&null_prompt_start).unwrap(), json!({ "type": "realtime_conversation_start", + "connection": "audio", "prompt": null }) ); assert_eq!( serde_json::from_value::(json!({ - "type": "realtime_conversation_start" + "type": "realtime_conversation_start", + "connection": "audio" })) .unwrap(), default_prompt_start @@ -4648,6 +4673,7 @@ mod tests { assert_eq!( serde_json::from_value::(json!({ "type": "realtime_conversation_start", + "connection": "audio", "prompt": null })) .unwrap(), @@ -4693,6 +4719,7 @@ mod tests { serde_json::to_value(&webrtc_start).unwrap(), json!({ "type": "realtime_conversation_start", + "connection": "audio", "prompt": "be helpful", "session_id": "conv_1", "transport": { diff --git a/codex-rs/tui/src/app_server_session.rs b/codex-rs/tui/src/app_server_session.rs index 5f75bd390a6..b86e9829e08 100644 --- a/codex-rs/tui/src/app_server_session.rs +++ b/codex-rs/tui/src/app_server_session.rs @@ -657,6 +657,7 @@ impl AppServerSession { request_id, params: ThreadRealtimeStartParams { thread_id: thread_id.to_string(), + connection: params.connection, prompt: params.prompt, session_id: params.session_id, voice: params.voice, diff --git a/codex-rs/tui/src/chatwidget/realtime.rs b/codex-rs/tui/src/chatwidget/realtime.rs index 03a59c224ca..88c2eab9535 100644 --- a/codex-rs/tui/src/chatwidget/realtime.rs +++ b/codex-rs/tui/src/chatwidget/realtime.rs @@ -3,6 +3,7 @@ use codex_config::config_toml::RealtimeTransport; use codex_protocol::protocol::ConversationStartParams; use codex_protocol::protocol::ConversationStartTransport; use codex_protocol::protocol::RealtimeAudioFrame; +use codex_protocol::protocol::RealtimeConnection; use codex_protocol::protocol::RealtimeConversationClosedEvent; use codex_protocol::protocol::RealtimeConversationRealtimeEvent; use codex_protocol::protocol::RealtimeConversationStartedEvent; @@ -236,6 +237,7 @@ impl ChatWidget { ) { self.submit_op(AppCommand::realtime_conversation_start( ConversationStartParams { + connection: RealtimeConnection::Audio, prompt: None, session_id: None, transport, From d405279c7285011ff89487a977506122e6fa6eb8 Mon Sep 17 00:00:00 2001 From: Ahmed Ibrahim Date: Mon, 13 Apr 2026 15:28:31 -0700 Subject: [PATCH 02/17] Avoid spelling fixture in realtime transcript test --- .../tests/suite/v2/realtime_conversation.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/codex-rs/app-server/tests/suite/v2/realtime_conversation.rs b/codex-rs/app-server/tests/suite/v2/realtime_conversation.rs index 13b3f339e86..60fc8efd454 100644 --- a/codex-rs/app-server/tests/suite/v2/realtime_conversation.rs +++ b/codex-rs/app-server/tests/suite/v2/realtime_conversation.rs @@ -738,15 +738,15 @@ async fn realtime_text_connection_requests_text_output_and_final_transcript() -> }), json!({ "type": "response.output_text.delta", - "delta": "hel" + "delta": "hello " }), json!({ "type": "response.output_text.delta", - "delta": "lo" + "delta": "world" }), json!({ "type": "response.output_text.done", - "text": "hello" + "text": "hello world" }), ]]]) .await; @@ -820,19 +820,19 @@ async fn realtime_text_connection_requests_text_output_and_final_transcript() -> ThreadRealtimeTranscriptUpdatedNotification { thread_id: thread_start.thread.id.clone(), role: "assistant".to_string(), - text: "hel".to_string(), + text: "hello ".to_string(), update_kind: RealtimeTranscriptUpdateKind::Delta, }, ThreadRealtimeTranscriptUpdatedNotification { thread_id: thread_start.thread.id.clone(), role: "assistant".to_string(), - text: "lo".to_string(), + text: "world".to_string(), update_kind: RealtimeTranscriptUpdateKind::Delta, }, ThreadRealtimeTranscriptUpdatedNotification { thread_id: thread_start.thread.id, role: "assistant".to_string(), - text: "hello".to_string(), + text: "hello world".to_string(), update_kind: RealtimeTranscriptUpdateKind::Done, }, ] From 4d57fd1caa103ee91612636244f4680a72aec025 Mon Sep 17 00:00:00 2001 From: Ahmed Ibrahim Date: Mon, 13 Apr 2026 15:34:39 -0700 Subject: [PATCH 03/17] Rename realtime start field to output modality --- .../schema/json/ClientRequest.json | 2 +- .../codex_app_server_protocol.schemas.json | 12 ++-- .../codex_app_server_protocol.v2.schemas.json | 12 ++-- ...onnection.ts => RealtimeOutputModality.ts} | 2 +- .../schema/typescript/index.ts | 2 +- .../src/protocol/common.rs | 20 +++--- .../app-server-protocol/src/protocol/v2.rs | 4 +- codex-rs/app-server/README.md | 4 +- .../app-server/src/codex_message_processor.rs | 2 +- .../tests/suite/v2/experimental_api.rs | 6 +- .../tests/suite/v2/realtime_conversation.rs | 18 ++--- codex-rs/codex-api/src/endpoint/mod.rs | 2 +- .../codex-api/src/endpoint/realtime_call.rs | 4 +- .../endpoint/realtime_websocket/methods.rs | 18 ++--- .../realtime_websocket/methods_common.rs | 8 +-- .../endpoint/realtime_websocket/methods_v2.rs | 14 ++-- .../src/endpoint/realtime_websocket/mod.rs | 2 +- .../endpoint/realtime_websocket/protocol.rs | 4 +- codex-rs/codex-api/src/lib.rs | 2 +- .../codex-api/tests/realtime_websocket_e2e.rs | 14 ++-- codex-rs/core/src/realtime_conversation.rs | 12 ++-- codex-rs/core/tests/suite/compact_remote.rs | 4 +- .../core/tests/suite/realtime_conversation.rs | 68 +++++++++---------- codex-rs/protocol/src/protocol.rs | 24 +++---- codex-rs/tui/src/app_server_session.rs | 2 +- codex-rs/tui/src/chatwidget/realtime.rs | 4 +- 26 files changed, 133 insertions(+), 133 deletions(-) rename codex-rs/app-server-protocol/schema/typescript/{RealtimeConnection.ts => RealtimeOutputModality.ts} (73%) diff --git a/codex-rs/app-server-protocol/schema/json/ClientRequest.json b/codex-rs/app-server-protocol/schema/json/ClientRequest.json index a0c9bc90b47..7a20865f054 100644 --- a/codex-rs/app-server-protocol/schema/json/ClientRequest.json +++ b/codex-rs/app-server-protocol/schema/json/ClientRequest.json @@ -1499,7 +1499,7 @@ } ] }, - "RealtimeConnection": { + "RealtimeOutputModality": { "enum": [ "text", "audio" diff --git a/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.schemas.json b/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.schemas.json index 30d4e3b92bb..7446569e2cf 100644 --- a/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.schemas.json +++ b/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.schemas.json @@ -10608,17 +10608,17 @@ } ] }, - "RealtimeConnection": { + "RealtimeConversationVersion": { "enum": [ - "text", - "audio" + "v1", + "v2" ], "type": "string" }, - "RealtimeConversationVersion": { + "RealtimeOutputModality": { "enum": [ - "v1", - "v2" + "text", + "audio" ], "type": "string" }, diff --git a/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.v2.schemas.json b/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.v2.schemas.json index d1ba7646c9a..139125006d3 100644 --- a/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.v2.schemas.json +++ b/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.v2.schemas.json @@ -7404,17 +7404,17 @@ } ] }, - "RealtimeConnection": { + "RealtimeConversationVersion": { "enum": [ - "text", - "audio" + "v1", + "v2" ], "type": "string" }, - "RealtimeConversationVersion": { + "RealtimeOutputModality": { "enum": [ - "v1", - "v2" + "text", + "audio" ], "type": "string" }, diff --git a/codex-rs/app-server-protocol/schema/typescript/RealtimeConnection.ts b/codex-rs/app-server-protocol/schema/typescript/RealtimeOutputModality.ts similarity index 73% rename from codex-rs/app-server-protocol/schema/typescript/RealtimeConnection.ts rename to codex-rs/app-server-protocol/schema/typescript/RealtimeOutputModality.ts index bcd9e1b00ec..78e00e7143d 100644 --- a/codex-rs/app-server-protocol/schema/typescript/RealtimeConnection.ts +++ b/codex-rs/app-server-protocol/schema/typescript/RealtimeOutputModality.ts @@ -2,4 +2,4 @@ // This file was generated by [ts-rs](https://github.com/Aleph-Alpha/ts-rs). Do not edit this file manually. -export type RealtimeConnection = "text" | "audio"; +export type RealtimeOutputModality = "text" | "audio"; diff --git a/codex-rs/app-server-protocol/schema/typescript/index.ts b/codex-rs/app-server-protocol/schema/typescript/index.ts index d3d4c57e62d..fe9c81917bd 100644 --- a/codex-rs/app-server-protocol/schema/typescript/index.ts +++ b/codex-rs/app-server-protocol/schema/typescript/index.ts @@ -48,8 +48,8 @@ export type { NetworkPolicyRuleAction } from "./NetworkPolicyRuleAction"; export type { ParsedCommand } from "./ParsedCommand"; export type { Personality } from "./Personality"; export type { PlanType } from "./PlanType"; -export type { RealtimeConnection } from "./RealtimeConnection"; export type { RealtimeConversationVersion } from "./RealtimeConversationVersion"; +export type { RealtimeOutputModality } from "./RealtimeOutputModality"; export type { RealtimeTranscriptUpdateKind } from "./RealtimeTranscriptUpdateKind"; export type { RealtimeVoice } from "./RealtimeVoice"; export type { RealtimeVoicesList } from "./RealtimeVoicesList"; diff --git a/codex-rs/app-server-protocol/src/protocol/common.rs b/codex-rs/app-server-protocol/src/protocol/common.rs index 04ac2a5742d..3846d5984f2 100644 --- a/codex-rs/app-server-protocol/src/protocol/common.rs +++ b/codex-rs/app-server-protocol/src/protocol/common.rs @@ -1050,8 +1050,8 @@ mod tests { use codex_protocol::ThreadId; use codex_protocol::account::PlanType; use codex_protocol::parse_command::ParsedCommand; - use codex_protocol::protocol::RealtimeConnection; use codex_protocol::protocol::RealtimeConversationVersion; + use codex_protocol::protocol::RealtimeOutputModality; use codex_protocol::protocol::RealtimeVoice; use codex_utils_absolute_path::AbsolutePathBuf; use pretty_assertions::assert_eq; @@ -1781,7 +1781,7 @@ mod tests { request_id: RequestId::Integer(9), params: v2::ThreadRealtimeStartParams { thread_id: "thr_123".to_string(), - connection: RealtimeConnection::Audio, + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("You are on a call".to_string())), session_id: Some("sess_456".to_string()), transport: None, @@ -1794,7 +1794,7 @@ mod tests { "id": 9, "params": { "threadId": "thr_123", - "connection": "audio", + "outputModality": "audio", "prompt": "You are on a call", "sessionId": "sess_456", "transport": null, @@ -1812,7 +1812,7 @@ mod tests { request_id: RequestId::Integer(9), params: v2::ThreadRealtimeStartParams { thread_id: "thr_123".to_string(), - connection: RealtimeConnection::Audio, + output_modality: RealtimeOutputModality::Audio, prompt: None, session_id: None, transport: None, @@ -1825,7 +1825,7 @@ mod tests { "id": 9, "params": { "threadId": "thr_123", - "connection": "audio", + "outputModality": "audio", "sessionId": null, "transport": null, "voice": null @@ -1838,7 +1838,7 @@ mod tests { request_id: RequestId::Integer(9), params: v2::ThreadRealtimeStartParams { thread_id: "thr_123".to_string(), - connection: RealtimeConnection::Audio, + output_modality: RealtimeOutputModality::Audio, prompt: Some(None), session_id: None, transport: None, @@ -1851,7 +1851,7 @@ mod tests { "id": 9, "params": { "threadId": "thr_123", - "connection": "audio", + "outputModality": "audio", "prompt": null, "sessionId": null, "transport": null, @@ -1866,7 +1866,7 @@ mod tests { "id": 9, "params": { "threadId": "thr_123", - "connection": "audio", + "outputModality": "audio", "sessionId": null, "transport": null, "voice": null @@ -1882,7 +1882,7 @@ mod tests { "id": 9, "params": { "threadId": "thr_123", - "connection": "audio", + "outputModality": "audio", "prompt": null, "sessionId": null, "transport": null, @@ -1967,7 +1967,7 @@ mod tests { request_id: RequestId::Integer(1), params: v2::ThreadRealtimeStartParams { thread_id: "thr_123".to_string(), - connection: RealtimeConnection::Audio, + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("You are on a call".to_string())), session_id: None, transport: None, diff --git a/codex-rs/app-server-protocol/src/protocol/v2.rs b/codex-rs/app-server-protocol/src/protocol/v2.rs index 7bd8206f3fe..2c91b7d728b 100644 --- a/codex-rs/app-server-protocol/src/protocol/v2.rs +++ b/codex-rs/app-server-protocol/src/protocol/v2.rs @@ -73,8 +73,8 @@ use codex_protocol::protocol::RateLimitSnapshot as CoreRateLimitSnapshot; use codex_protocol::protocol::RateLimitWindow as CoreRateLimitWindow; use codex_protocol::protocol::ReadOnlyAccess as CoreReadOnlyAccess; use codex_protocol::protocol::RealtimeAudioFrame as CoreRealtimeAudioFrame; -use codex_protocol::protocol::RealtimeConnection; use codex_protocol::protocol::RealtimeConversationVersion; +use codex_protocol::protocol::RealtimeOutputModality; use codex_protocol::protocol::RealtimeTranscriptUpdateKind; use codex_protocol::protocol::RealtimeVoice; use codex_protocol::protocol::RealtimeVoicesList; @@ -3963,7 +3963,7 @@ pub struct ThreadRealtimeStartParams { pub thread_id: String, /// Selects text or audio output for the realtime session. Transport and voice stay /// independent so clients can choose how they connect separately from what the model emits. - pub connection: RealtimeConnection, + pub output_modality: RealtimeOutputModality, #[serde( default, deserialize_with = "super::serde_helpers::deserialize_double_option", diff --git a/codex-rs/app-server/README.md b/codex-rs/app-server/README.md index f67ac7596c5..cf820e28aa4 100644 --- a/codex-rs/app-server/README.md +++ b/codex-rs/app-server/README.md @@ -153,7 +153,7 @@ Example with notification opt-out: - `turn/start` — add user input to a thread and begin Codex generation; responds with the initial `turn` object and streams `turn/started`, `item/*`, and `turn/completed` notifications. For `collaborationMode`, `settings.developer_instructions: null` means "use built-in instructions for the selected mode". - `turn/steer` — add user input to an already in-flight regular turn without starting a new turn; returns the active `turnId` that accepted the input. Review and manual compaction turns reject `turn/steer`. - `turn/interrupt` — request cancellation of an in-flight turn by `(thread_id, turn_id)`; success is an empty `{}` response and the turn finishes with `status: "interrupted"`. -- `thread/realtime/start` — start a thread-scoped realtime session (experimental); pass `connection: "text"` or `connection: "audio"` to choose model output, returns `{}` and streams `thread/realtime/*` notifications. Omit `transport` for the websocket transport, or pass `{ "type": "webrtc", "sdp": "..." }` to create a WebRTC session from a browser-generated SDP offer; the remote answer SDP is emitted as `thread/realtime/sdp`. +- `thread/realtime/start` — start a thread-scoped realtime session (experimental); pass `outputModality: "text"` or `outputModality: "audio"` to choose model output, returns `{}` and streams `thread/realtime/*` notifications. Omit `transport` for the websocket transport, or pass `{ "type": "webrtc", "sdp": "..." }` to create a WebRTC session from a browser-generated SDP offer; the remote answer SDP is emitted as `thread/realtime/sdp`. - `thread/realtime/appendAudio` — append an input audio chunk to the active realtime session (experimental); returns `{}`. - `thread/realtime/appendText` — append text input to the active realtime session (experimental); returns `{}`. - `thread/realtime/stop` — stop the active realtime session for the thread (experimental); returns `{}`. @@ -608,7 +608,7 @@ Then send `offer.sdp` to app-server. Core uses `experimental_realtime_ws_backend ```json { "method": "thread/realtime/start", "id": 40, "params": { "threadId": "thr_123", - "connection": "audio", + "outputModality": "audio", "prompt": "You are on a call.", "sessionId": null, "transport": { "type": "webrtc", "sdp": "v=0\r\no=..." } diff --git a/codex-rs/app-server/src/codex_message_processor.rs b/codex-rs/app-server/src/codex_message_processor.rs index 7ab7e42070c..b9e1f65b81b 100644 --- a/codex-rs/app-server/src/codex_message_processor.rs +++ b/codex-rs/app-server/src/codex_message_processor.rs @@ -7163,7 +7163,7 @@ impl CodexMessageProcessor { &request_id, thread.as_ref(), Op::RealtimeConversationStart(ConversationStartParams { - connection: params.connection, + output_modality: params.output_modality, prompt: params.prompt, session_id: params.session_id, transport: params.transport.map(|transport| match transport { diff --git a/codex-rs/app-server/tests/suite/v2/experimental_api.rs b/codex-rs/app-server/tests/suite/v2/experimental_api.rs index 4dc400984a1..2fd457faf23 100644 --- a/codex-rs/app-server/tests/suite/v2/experimental_api.rs +++ b/codex-rs/app-server/tests/suite/v2/experimental_api.rs @@ -17,7 +17,7 @@ use codex_app_server_protocol::ThreadRealtimeStartParams; use codex_app_server_protocol::ThreadRealtimeStartTransport; use codex_app_server_protocol::ThreadStartParams; use codex_app_server_protocol::ThreadStartResponse; -use codex_protocol::protocol::RealtimeConnection; +use codex_protocol::protocol::RealtimeOutputModality; use pretty_assertions::assert_eq; use std::path::Path; use std::time::Duration; @@ -77,7 +77,7 @@ async fn realtime_conversation_start_requires_experimental_api_capability() -> R let request_id = mcp .send_thread_realtime_start_request(ThreadRealtimeStartParams { thread_id: "thr_123".to_string(), - connection: RealtimeConnection::Audio, + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("hello".to_string())), session_id: None, transport: None, @@ -147,7 +147,7 @@ async fn realtime_webrtc_start_requires_experimental_api_capability() -> Result< let request_id = mcp .send_thread_realtime_start_request(ThreadRealtimeStartParams { thread_id: "thr_123".to_string(), - connection: RealtimeConnection::Audio, + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("hello".to_string())), session_id: None, transport: Some(ThreadRealtimeStartTransport::Webrtc { diff --git a/codex-rs/app-server/tests/suite/v2/realtime_conversation.rs b/codex-rs/app-server/tests/suite/v2/realtime_conversation.rs index 60fc8efd454..47d17a9aa1d 100644 --- a/codex-rs/app-server/tests/suite/v2/realtime_conversation.rs +++ b/codex-rs/app-server/tests/suite/v2/realtime_conversation.rs @@ -38,8 +38,8 @@ use codex_app_server_protocol::TurnCompletedNotification; use codex_app_server_protocol::TurnStartedNotification; use codex_features::FEATURES; use codex_features::Feature; -use codex_protocol::protocol::RealtimeConnection; use codex_protocol::protocol::RealtimeConversationVersion; +use codex_protocol::protocol::RealtimeOutputModality; use codex_protocol::protocol::RealtimeTranscriptUpdateKind; use codex_protocol::protocol::RealtimeVoice; use codex_protocol::protocol::RealtimeVoicesList; @@ -303,7 +303,7 @@ impl RealtimeE2eHarness { .mcp .send_thread_realtime_start_request(ThreadRealtimeStartParams { thread_id: self.thread_id.clone(), - connection: RealtimeConnection::Audio, + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: Some(ThreadRealtimeStartTransport::Webrtc { @@ -530,7 +530,7 @@ async fn realtime_conversation_streams_v2_notifications() -> Result<()> { let start_request_id = mcp .send_thread_realtime_start_request(ThreadRealtimeStartParams { thread_id: thread_start.thread.id.clone(), - connection: RealtimeConnection::Audio, + output_modality: RealtimeOutputModality::Audio, prompt: None, session_id: None, transport: None, @@ -727,7 +727,7 @@ async fn realtime_conversation_streams_v2_notifications() -> Result<()> { } #[tokio::test] -async fn realtime_text_connection_requests_text_output_and_final_transcript() -> Result<()> { +async fn realtime_text_output_modality_requests_text_output_and_final_transcript() -> Result<()> { skip_if_no_network!(Ok(())); let responses_server = create_mock_responses_server_sequence_unchecked(Vec::new()).await; @@ -777,7 +777,7 @@ async fn realtime_text_connection_requests_text_output_and_final_transcript() -> let start_request_id = mcp .send_thread_realtime_start_request(ThreadRealtimeStartParams { thread_id: thread_start.thread.id.clone(), - connection: RealtimeConnection::Text, + output_modality: RealtimeOutputModality::Text, prompt: None, session_id: None, transport: None, @@ -942,7 +942,7 @@ async fn realtime_conversation_stop_emits_closed_notification() -> Result<()> { let start_request_id = mcp .send_thread_realtime_start_request(ThreadRealtimeStartParams { thread_id: thread_start.thread.id.clone(), - connection: RealtimeConnection::Audio, + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, @@ -1039,7 +1039,7 @@ async fn realtime_webrtc_start_emits_sdp_notification() -> Result<()> { let start_request_id = mcp .send_thread_realtime_start_request(ThreadRealtimeStartParams { thread_id: thread_id.clone(), - connection: RealtimeConnection::Audio, + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: Some(ThreadRealtimeStartTransport::Webrtc { @@ -1889,7 +1889,7 @@ async fn realtime_webrtc_start_surfaces_backend_error() -> Result<()> { let start_request_id = mcp .send_thread_realtime_start_request(ThreadRealtimeStartParams { thread_id: thread_start.thread.id, - connection: RealtimeConnection::Audio, + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: Some(ThreadRealtimeStartTransport::Webrtc { @@ -1948,7 +1948,7 @@ async fn realtime_conversation_requires_feature_flag() -> Result<()> { let start_request_id = mcp .send_thread_realtime_start_request(ThreadRealtimeStartParams { thread_id: thread_start.thread.id.clone(), - connection: RealtimeConnection::Audio, + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, diff --git a/codex-rs/codex-api/src/endpoint/mod.rs b/codex-rs/codex-api/src/endpoint/mod.rs index 815c3fbba15..515487074f2 100644 --- a/codex-rs/codex-api/src/endpoint/mod.rs +++ b/codex-rs/codex-api/src/endpoint/mod.rs @@ -12,8 +12,8 @@ pub use memories::MemoriesClient; pub use models::ModelsClient; pub use realtime_call::RealtimeCallClient; pub use realtime_call::RealtimeCallResponse; -pub use realtime_websocket::RealtimeConnection; pub use realtime_websocket::RealtimeEventParser; +pub use realtime_websocket::RealtimeOutputModality; pub use realtime_websocket::RealtimeSessionConfig; pub use realtime_websocket::RealtimeSessionMode; pub use realtime_websocket::RealtimeTranscriptUpdateKind; diff --git a/codex-rs/codex-api/src/endpoint/realtime_call.rs b/codex-rs/codex-api/src/endpoint/realtime_call.rs index 1900c3443ee..fbcdbf519ee 100644 --- a/codex-rs/codex-api/src/endpoint/realtime_call.rs +++ b/codex-rs/codex-api/src/endpoint/realtime_call.rs @@ -218,8 +218,8 @@ fn decode_call_id_from_location(headers: &HeaderMap) -> Result #[cfg(test)] mod tests { use super::*; - use crate::endpoint::realtime_websocket::RealtimeConnection; use crate::endpoint::realtime_websocket::RealtimeEventParser; + use crate::endpoint::realtime_websocket::RealtimeOutputModality; use crate::endpoint::realtime_websocket::RealtimeSessionMode; use crate::provider::RetryConfig; use async_trait::async_trait; @@ -310,7 +310,7 @@ mod tests { session_id: Some(session_id.to_string()), event_parser: RealtimeEventParser::RealtimeV2, session_mode: RealtimeSessionMode::Conversational, - connection: RealtimeConnection::Audio, + output_modality: RealtimeOutputModality::Audio, voice: RealtimeVoice::Marin, } } diff --git a/codex-rs/codex-api/src/endpoint/realtime_websocket/methods.rs b/codex-rs/codex-api/src/endpoint/realtime_websocket/methods.rs index f4d3f0648ee..36bb8cecd0c 100644 --- a/codex-rs/codex-api/src/endpoint/realtime_websocket/methods.rs +++ b/codex-rs/codex-api/src/endpoint/realtime_websocket/methods.rs @@ -4,10 +4,10 @@ use crate::endpoint::realtime_websocket::methods_common::normalized_session_mode use crate::endpoint::realtime_websocket::methods_common::session_update_session; use crate::endpoint::realtime_websocket::methods_common::websocket_intent; use crate::endpoint::realtime_websocket::protocol::RealtimeAudioFrame; -use crate::endpoint::realtime_websocket::protocol::RealtimeConnection; use crate::endpoint::realtime_websocket::protocol::RealtimeEvent; use crate::endpoint::realtime_websocket::protocol::RealtimeEventParser; use crate::endpoint::realtime_websocket::protocol::RealtimeOutboundMessage; +use crate::endpoint::realtime_websocket::protocol::RealtimeOutputModality; use crate::endpoint::realtime_websocket::protocol::RealtimeSessionConfig; use crate::endpoint::realtime_websocket::protocol::RealtimeSessionMode; use crate::endpoint::realtime_websocket::protocol::RealtimeTranscriptDelta; @@ -309,7 +309,7 @@ impl RealtimeWebsocketWriter { &self, instructions: String, session_mode: RealtimeSessionMode, - connection: RealtimeConnection, + output_modality: RealtimeOutputModality, voice: RealtimeVoice, ) -> Result<(), ApiError> { let session_mode = normalized_session_mode(self.event_parser, session_mode); @@ -317,7 +317,7 @@ impl RealtimeWebsocketWriter { self.event_parser, instructions, session_mode, - connection, + output_modality, voice, ); self.send_json(&RealtimeOutboundMessage::SessionUpdate { session }) @@ -628,7 +628,7 @@ impl RealtimeWebsocketClient { .send_session_update( config.instructions, config.session_mode, - config.connection, + config.output_modality, config.voice, ) .await?; @@ -1445,7 +1445,7 @@ mod tests { session_id: Some("conv_1".to_string()), event_parser: RealtimeEventParser::V1, session_mode: RealtimeSessionMode::Conversational, - connection: RealtimeConnection::Audio, + output_modality: RealtimeOutputModality::Audio, voice: RealtimeVoice::Breeze, }, HeaderMap::new(), @@ -1723,7 +1723,7 @@ mod tests { session_id: Some("conv_1".to_string()), event_parser: RealtimeEventParser::RealtimeV2, session_mode: RealtimeSessionMode::Conversational, - connection: RealtimeConnection::Audio, + output_modality: RealtimeOutputModality::Audio, voice: RealtimeVoice::Cedar, }, HeaderMap::new(), @@ -1829,7 +1829,7 @@ mod tests { session_id: Some("conv_1".to_string()), event_parser: RealtimeEventParser::RealtimeV2, session_mode: RealtimeSessionMode::Transcription, - connection: RealtimeConnection::Audio, + output_modality: RealtimeOutputModality::Audio, voice: RealtimeVoice::Marin, }, HeaderMap::new(), @@ -1933,7 +1933,7 @@ mod tests { session_id: Some("conv_1".to_string()), event_parser: RealtimeEventParser::V1, session_mode: RealtimeSessionMode::Transcription, - connection: RealtimeConnection::Audio, + output_modality: RealtimeOutputModality::Audio, voice: RealtimeVoice::Cove, }, HeaderMap::new(), @@ -2023,7 +2023,7 @@ mod tests { session_id: Some("conv_1".to_string()), event_parser: RealtimeEventParser::V1, session_mode: RealtimeSessionMode::Conversational, - connection: RealtimeConnection::Audio, + output_modality: RealtimeOutputModality::Audio, voice: RealtimeVoice::Cove, }, HeaderMap::new(), diff --git a/codex-rs/codex-api/src/endpoint/realtime_websocket/methods_common.rs b/codex-rs/codex-api/src/endpoint/realtime_websocket/methods_common.rs index fe4ca8d2cd8..67345bdc7ea 100644 --- a/codex-rs/codex-api/src/endpoint/realtime_websocket/methods_common.rs +++ b/codex-rs/codex-api/src/endpoint/realtime_websocket/methods_common.rs @@ -6,9 +6,9 @@ use crate::endpoint::realtime_websocket::methods_v2::conversation_handoff_append use crate::endpoint::realtime_websocket::methods_v2::conversation_item_create_message as v2_conversation_item_create_message; use crate::endpoint::realtime_websocket::methods_v2::session_update_session as v2_session_update_session; use crate::endpoint::realtime_websocket::methods_v2::websocket_intent as v2_websocket_intent; -use crate::endpoint::realtime_websocket::protocol::RealtimeConnection; use crate::endpoint::realtime_websocket::protocol::RealtimeEventParser; use crate::endpoint::realtime_websocket::protocol::RealtimeOutboundMessage; +use crate::endpoint::realtime_websocket::protocol::RealtimeOutputModality; use crate::endpoint::realtime_websocket::protocol::RealtimeSessionConfig; use crate::endpoint::realtime_websocket::protocol::RealtimeSessionMode; use crate::endpoint::realtime_websocket::protocol::RealtimeVoice; @@ -58,14 +58,14 @@ pub(super) fn session_update_session( event_parser: RealtimeEventParser, instructions: String, session_mode: RealtimeSessionMode, - connection: RealtimeConnection, + output_modality: RealtimeOutputModality, voice: RealtimeVoice, ) -> SessionUpdateSession { let session_mode = normalized_session_mode(event_parser, session_mode); match event_parser { RealtimeEventParser::V1 => v1_session_update_session(instructions, voice), RealtimeEventParser::RealtimeV2 => { - v2_session_update_session(instructions, session_mode, connection, voice) + v2_session_update_session(instructions, session_mode, output_modality, voice) } } } @@ -75,7 +75,7 @@ pub fn session_update_session_json(config: RealtimeSessionConfig) -> JsonResult< config.event_parser, config.instructions, config.session_mode, - config.connection, + config.output_modality, config.voice, ); session.id = config.session_id; diff --git a/codex-rs/codex-api/src/endpoint/realtime_websocket/methods_v2.rs b/codex-rs/codex-api/src/endpoint/realtime_websocket/methods_v2.rs index 4b2f0323c8d..204909b7f68 100644 --- a/codex-rs/codex-api/src/endpoint/realtime_websocket/methods_v2.rs +++ b/codex-rs/codex-api/src/endpoint/realtime_websocket/methods_v2.rs @@ -8,8 +8,8 @@ use crate::endpoint::realtime_websocket::protocol::ConversationItemType; use crate::endpoint::realtime_websocket::protocol::ConversationMessageItem; use crate::endpoint::realtime_websocket::protocol::ConversationRole; use crate::endpoint::realtime_websocket::protocol::NoiseReductionType; -use crate::endpoint::realtime_websocket::protocol::RealtimeConnection; use crate::endpoint::realtime_websocket::protocol::RealtimeOutboundMessage; +use crate::endpoint::realtime_websocket::protocol::RealtimeOutputModality; use crate::endpoint::realtime_websocket::protocol::RealtimeSessionMode; use crate::endpoint::realtime_websocket::protocol::RealtimeVoice; use crate::endpoint::realtime_websocket::protocol::SessionAudio; @@ -61,7 +61,7 @@ pub(super) fn conversation_handoff_append_message( pub(super) fn session_update_session( instructions: String, session_mode: RealtimeSessionMode, - connection: RealtimeConnection, + output_modality: RealtimeOutputModality, voice: RealtimeVoice, ) -> SessionUpdateSession { match session_mode { @@ -70,7 +70,7 @@ pub(super) fn session_update_session( r#type: SessionType::Realtime, model: None, instructions: Some(instructions), - output_modalities: Some(vec![output_modality(connection).to_string()]), + output_modalities: Some(vec![output_modality_value(output_modality).to_string()]), audio: SessionAudio { input: SessionAudioInput { format: SessionAudioFormat { @@ -135,10 +135,10 @@ pub(super) fn session_update_session( } } -fn output_modality(connection: RealtimeConnection) -> &'static str { - match connection { - RealtimeConnection::Text => REALTIME_V2_OUTPUT_MODALITY_TEXT, - RealtimeConnection::Audio => REALTIME_V2_OUTPUT_MODALITY_AUDIO, +fn output_modality_value(output_modality: RealtimeOutputModality) -> &'static str { + match output_modality { + RealtimeOutputModality::Text => REALTIME_V2_OUTPUT_MODALITY_TEXT, + RealtimeOutputModality::Audio => REALTIME_V2_OUTPUT_MODALITY_AUDIO, } } diff --git a/codex-rs/codex-api/src/endpoint/realtime_websocket/mod.rs b/codex-rs/codex-api/src/endpoint/realtime_websocket/mod.rs index c93f6cf2efa..254849b7fd8 100644 --- a/codex-rs/codex-api/src/endpoint/realtime_websocket/mod.rs +++ b/codex-rs/codex-api/src/endpoint/realtime_websocket/mod.rs @@ -12,8 +12,8 @@ pub use methods::RealtimeWebsocketConnection; pub use methods::RealtimeWebsocketEvents; pub use methods::RealtimeWebsocketWriter; pub use methods_common::session_update_session_json; -pub use protocol::RealtimeConnection; pub use protocol::RealtimeEventParser; +pub use protocol::RealtimeOutputModality; pub use protocol::RealtimeSessionConfig; pub use protocol::RealtimeSessionMode; pub use protocol::RealtimeTranscriptUpdateKind; diff --git a/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol.rs b/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol.rs index 1aa7e1473a3..b193de2e16e 100644 --- a/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol.rs +++ b/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol.rs @@ -1,8 +1,8 @@ use crate::endpoint::realtime_websocket::protocol_v1::parse_realtime_event_v1; use crate::endpoint::realtime_websocket::protocol_v2::parse_realtime_event_v2; pub use codex_protocol::protocol::RealtimeAudioFrame; -pub use codex_protocol::protocol::RealtimeConnection; pub use codex_protocol::protocol::RealtimeEvent; +pub use codex_protocol::protocol::RealtimeOutputModality; pub use codex_protocol::protocol::RealtimeTranscriptDelta; pub use codex_protocol::protocol::RealtimeTranscriptEntry; pub use codex_protocol::protocol::RealtimeTranscriptUpdateKind; @@ -29,7 +29,7 @@ pub struct RealtimeSessionConfig { pub session_id: Option, pub event_parser: RealtimeEventParser, pub session_mode: RealtimeSessionMode, - pub connection: RealtimeConnection, + pub output_modality: RealtimeOutputModality, pub voice: RealtimeVoice, } diff --git a/codex-rs/codex-api/src/lib.rs b/codex-rs/codex-api/src/lib.rs index bd592bfceed..1a280c8bf41 100644 --- a/codex-rs/codex-api/src/lib.rs +++ b/codex-rs/codex-api/src/lib.rs @@ -40,8 +40,8 @@ pub use crate::endpoint::MemoriesClient; pub use crate::endpoint::ModelsClient; pub use crate::endpoint::RealtimeCallClient; pub use crate::endpoint::RealtimeCallResponse; -pub use crate::endpoint::RealtimeConnection; pub use crate::endpoint::RealtimeEventParser; +pub use crate::endpoint::RealtimeOutputModality; pub use crate::endpoint::RealtimeSessionConfig; pub use crate::endpoint::RealtimeSessionMode; pub use crate::endpoint::RealtimeTranscriptUpdateKind; diff --git a/codex-rs/codex-api/tests/realtime_websocket_e2e.rs b/codex-rs/codex-api/tests/realtime_websocket_e2e.rs index c03743d776f..abafaef2aed 100644 --- a/codex-rs/codex-api/tests/realtime_websocket_e2e.rs +++ b/codex-rs/codex-api/tests/realtime_websocket_e2e.rs @@ -4,9 +4,9 @@ use std::time::Duration; use codex_api::Provider; use codex_api::RealtimeAudioFrame; -use codex_api::RealtimeConnection; use codex_api::RealtimeEvent; use codex_api::RealtimeEventParser; +use codex_api::RealtimeOutputModality; use codex_api::RealtimeSessionConfig; use codex_api::RealtimeSessionMode; use codex_api::RealtimeWebsocketClient; @@ -146,7 +146,7 @@ async fn realtime_ws_e2e_session_create_and_event_flow() { session_id: Some("conv_123".to_string()), event_parser: RealtimeEventParser::V1, session_mode: RealtimeSessionMode::Conversational, - connection: RealtimeConnection::Audio, + output_modality: RealtimeOutputModality::Audio, voice: RealtimeVoice::Cove, }, HeaderMap::new(), @@ -250,7 +250,7 @@ async fn realtime_ws_connect_webrtc_sideband_retries_join_until_server_is_availa session_id: Some("conv_123".to_string()), event_parser: RealtimeEventParser::RealtimeV2, session_mode: RealtimeSessionMode::Conversational, - connection: RealtimeConnection::Audio, + output_modality: RealtimeOutputModality::Audio, voice: RealtimeVoice::Marin, }, "rtc_test", @@ -322,7 +322,7 @@ async fn realtime_ws_e2e_send_while_next_event_waits() { session_id: Some("conv_123".to_string()), event_parser: RealtimeEventParser::V1, session_mode: RealtimeSessionMode::Conversational, - connection: RealtimeConnection::Audio, + output_modality: RealtimeOutputModality::Audio, voice: RealtimeVoice::Cove, }, HeaderMap::new(), @@ -390,7 +390,7 @@ async fn realtime_ws_e2e_disconnected_emitted_once() { session_id: Some("conv_123".to_string()), event_parser: RealtimeEventParser::V1, session_mode: RealtimeSessionMode::Conversational, - connection: RealtimeConnection::Audio, + output_modality: RealtimeOutputModality::Audio, voice: RealtimeVoice::Cove, }, HeaderMap::new(), @@ -454,7 +454,7 @@ async fn realtime_ws_e2e_ignores_unknown_text_events() { session_id: Some("conv_123".to_string()), event_parser: RealtimeEventParser::V1, session_mode: RealtimeSessionMode::Conversational, - connection: RealtimeConnection::Audio, + output_modality: RealtimeOutputModality::Audio, voice: RealtimeVoice::Cove, }, HeaderMap::new(), @@ -521,7 +521,7 @@ async fn realtime_ws_e2e_realtime_v2_parser_emits_handoff_requested() { session_id: Some("conv_123".to_string()), event_parser: RealtimeEventParser::RealtimeV2, session_mode: RealtimeSessionMode::Conversational, - connection: RealtimeConnection::Audio, + output_modality: RealtimeOutputModality::Audio, voice: RealtimeVoice::Marin, }, HeaderMap::new(), diff --git a/codex-rs/core/src/realtime_conversation.rs b/codex-rs/core/src/realtime_conversation.rs index f1291ef333a..be9568cb3aa 100644 --- a/codex-rs/core/src/realtime_conversation.rs +++ b/codex-rs/core/src/realtime_conversation.rs @@ -37,12 +37,12 @@ use codex_protocol::protocol::ConversationTextParams; use codex_protocol::protocol::ErrorEvent; use codex_protocol::protocol::Event; use codex_protocol::protocol::EventMsg; -use codex_protocol::protocol::RealtimeConnection; use codex_protocol::protocol::RealtimeConversationClosedEvent; use codex_protocol::protocol::RealtimeConversationRealtimeEvent; use codex_protocol::protocol::RealtimeConversationSdpEvent; use codex_protocol::protocol::RealtimeConversationStartedEvent; use codex_protocol::protocol::RealtimeHandoffRequested; +use codex_protocol::protocol::RealtimeOutputModality; use codex_protocol::protocol::RealtimeVoice; use codex_protocol::protocol::RealtimeVoicesList; use http::HeaderMap; @@ -598,7 +598,7 @@ async fn prepare_realtime_start( sess, params.prompt, params.session_id, - params.connection, + params.output_modality, params.voice, ) .await?; @@ -629,7 +629,7 @@ pub(crate) async fn build_realtime_session_config( sess: &Arc, prompt: Option>, session_id: Option, - connection: RealtimeConnection, + output_modality: RealtimeOutputModality, voice: Option, ) -> CodexResult { let config = sess.get_config().await; @@ -662,10 +662,10 @@ pub(crate) async fn build_realtime_session_config( RealtimeWsVersion::V2 => RealtimeEventParser::RealtimeV2, }; if config.realtime.version == RealtimeWsVersion::V1 - && matches!(connection, RealtimeConnection::Text) + && matches!(output_modality, RealtimeOutputModality::Text) { return Err(CodexErr::InvalidRequest( - "text realtime connection requires realtime v2".to_string(), + "text realtime output modality requires realtime v2".to_string(), )); } let session_mode = match config.realtime.session_type { @@ -682,7 +682,7 @@ pub(crate) async fn build_realtime_session_config( session_id: Some(session_id.unwrap_or_else(|| sess.conversation_id.to_string())), event_parser, session_mode, - connection, + output_modality, voice, }) } diff --git a/codex-rs/core/tests/suite/compact_remote.rs b/codex-rs/core/tests/suite/compact_remote.rs index 9e32b4c1a9a..8322046d10f 100644 --- a/codex-rs/core/tests/suite/compact_remote.rs +++ b/codex-rs/core/tests/suite/compact_remote.rs @@ -15,9 +15,9 @@ use codex_protocol::protocol::EventMsg; use codex_protocol::protocol::ItemCompletedEvent; use codex_protocol::protocol::ItemStartedEvent; use codex_protocol::protocol::Op; -use codex_protocol::protocol::RealtimeConnection; use codex_protocol::protocol::RealtimeConversationRealtimeEvent; use codex_protocol::protocol::RealtimeEvent; +use codex_protocol::protocol::RealtimeOutputModality; use codex_protocol::protocol::RolloutItem; use codex_protocol::protocol::RolloutLine; use codex_protocol::user_input::UserInput; @@ -117,7 +117,7 @@ async fn start_remote_realtime_server() -> responses::WebSocketTestServer { async fn start_realtime_conversation(codex: &codex_core::CodexThread) -> Result<()> { codex .submit(Op::RealtimeConversationStart(ConversationStartParams { - connection: RealtimeConnection::Audio, + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, diff --git a/codex-rs/core/tests/suite/realtime_conversation.rs b/codex-rs/core/tests/suite/realtime_conversation.rs index fb59578586b..9ee05b49bac 100644 --- a/codex-rs/core/tests/suite/realtime_conversation.rs +++ b/codex-rs/core/tests/suite/realtime_conversation.rs @@ -18,10 +18,10 @@ use codex_protocol::protocol::EventMsg; use codex_protocol::protocol::InitialHistory; use codex_protocol::protocol::Op; use codex_protocol::protocol::RealtimeAudioFrame; -use codex_protocol::protocol::RealtimeConnection; use codex_protocol::protocol::RealtimeConversationRealtimeEvent; use codex_protocol::protocol::RealtimeConversationVersion; use codex_protocol::protocol::RealtimeEvent; +use codex_protocol::protocol::RealtimeOutputModality; use codex_protocol::protocol::RealtimeVoice; use codex_protocol::protocol::RolloutItem; use codex_protocol::protocol::SessionSource; @@ -248,7 +248,7 @@ async fn conversation_start_audio_text_close_round_trip() -> Result<()> { test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { - connection: RealtimeConnection::Audio, + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, @@ -382,7 +382,7 @@ async fn conversation_start_defaults_to_v2_and_gpt_realtime_1_5() -> Result<()> test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { - connection: RealtimeConnection::Audio, + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, @@ -466,7 +466,7 @@ async fn conversation_webrtc_start_posts_generated_session() -> Result<()> { test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { - connection: RealtimeConnection::Audio, + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: Some(ConversationStartTransport::Webrtc { @@ -604,7 +604,7 @@ async fn conversation_start_uses_openai_env_key_fallback_with_chatgpt_auth() -> test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { - connection: RealtimeConnection::Audio, + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, @@ -666,7 +666,7 @@ async fn conversation_transport_close_emits_closed_event() -> Result<()> { test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { - connection: RealtimeConnection::Audio, + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, @@ -752,7 +752,7 @@ async fn conversation_start_preflight_failure_emits_realtime_error_only() -> Res test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { - connection: RealtimeConnection::Audio, + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, @@ -796,7 +796,7 @@ async fn conversation_start_connect_failure_emits_realtime_error_only() -> Resul test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { - connection: RealtimeConnection::Audio, + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, @@ -887,7 +887,7 @@ async fn conversation_second_start_replaces_runtime() -> Result<()> { test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { - connection: RealtimeConnection::Audio, + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("old".to_string())), session_id: Some("conv_old".to_string()), transport: None, @@ -906,7 +906,7 @@ async fn conversation_second_start_replaces_runtime() -> Result<()> { test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { - connection: RealtimeConnection::Audio, + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("new".to_string())), session_id: Some("conv_new".to_string()), transport: None, @@ -996,7 +996,7 @@ async fn conversation_uses_experimental_realtime_ws_base_url_override() -> Resul test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { - connection: RealtimeConnection::Audio, + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, @@ -1054,7 +1054,7 @@ async fn conversation_uses_default_realtime_backend_prompt() -> Result<()> { test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { - connection: RealtimeConnection::Audio, + output_modality: RealtimeOutputModality::Audio, prompt: None, session_id: None, transport: None, @@ -1120,7 +1120,7 @@ async fn conversation_uses_empty_instructions_for_null_or_empty_prompt() -> Resu ] { test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { - connection: RealtimeConnection::Audio, + output_modality: RealtimeOutputModality::Audio, prompt, session_id: None, transport: None, @@ -1179,7 +1179,7 @@ async fn conversation_uses_explicit_start_voice() -> Result<()> { test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { - connection: RealtimeConnection::Audio, + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, @@ -1230,7 +1230,7 @@ async fn conversation_uses_configured_realtime_voice() -> Result<()> { test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { - connection: RealtimeConnection::Audio, + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, @@ -1269,7 +1269,7 @@ async fn conversation_rejects_voice_for_wrong_realtime_version() -> Result<()> { test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { - connection: RealtimeConnection::Audio, + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, @@ -1313,7 +1313,7 @@ async fn conversation_uses_experimental_realtime_ws_backend_prompt_override() -> test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { - connection: RealtimeConnection::Audio, + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("prompt from op".to_string())), session_id: None, transport: None, @@ -1379,7 +1379,7 @@ async fn conversation_uses_experimental_realtime_ws_startup_context_override() - test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { - connection: RealtimeConnection::Audio, + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("prompt from op".to_string())), session_id: None, transport: None, @@ -1443,7 +1443,7 @@ async fn conversation_disables_realtime_startup_context_with_empty_override() -> test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { - connection: RealtimeConnection::Audio, + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("prompt from op".to_string())), session_id: None, transport: None, @@ -1500,7 +1500,7 @@ async fn conversation_start_injects_startup_context_from_thread_history() -> Res test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { - connection: RealtimeConnection::Audio, + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, @@ -1612,7 +1612,7 @@ async fn conversation_startup_context_current_thread_selects_many_turns_by_budge codex .submit(Op::RealtimeConversationStart(ConversationStartParams { - connection: RealtimeConnection::Audio, + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, @@ -1717,7 +1717,7 @@ async fn conversation_startup_context_falls_back_to_workspace_map() -> Result<() test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { - connection: RealtimeConnection::Audio, + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, @@ -1772,7 +1772,7 @@ async fn conversation_startup_context_is_truncated_and_sent_once_per_start() -> test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { - connection: RealtimeConnection::Audio, + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, @@ -1848,7 +1848,7 @@ async fn conversation_user_text_turn_is_sent_to_realtime_when_active() -> Result test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { - connection: RealtimeConnection::Audio, + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, @@ -1981,7 +1981,7 @@ async fn conversation_mirrors_assistant_message_text_to_realtime_handoff() -> Re test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { - connection: RealtimeConnection::Audio, + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, @@ -2111,7 +2111,7 @@ async fn conversation_handoff_persists_across_item_done_until_turn_complete() -> test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { - connection: RealtimeConnection::Audio, + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, @@ -2256,7 +2256,7 @@ async fn inbound_handoff_request_starts_turn() -> Result<()> { test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { - connection: RealtimeConnection::Audio, + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, @@ -2354,7 +2354,7 @@ async fn inbound_handoff_request_uses_active_transcript() -> Result<()> { test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { - connection: RealtimeConnection::Audio, + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, @@ -2450,7 +2450,7 @@ async fn inbound_handoff_request_clears_active_transcript_after_each_handoff() - test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { - connection: RealtimeConnection::Audio, + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, @@ -2553,7 +2553,7 @@ async fn inbound_conversation_item_does_not_start_turn_and_still_forwards_audio( test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { - connection: RealtimeConnection::Audio, + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, @@ -2669,7 +2669,7 @@ async fn delegated_turn_user_role_echo_does_not_redelegate_and_still_forwards_au test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { - connection: RealtimeConnection::Audio, + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, @@ -2815,7 +2815,7 @@ async fn inbound_handoff_request_does_not_block_realtime_event_forwarding() -> R test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { - connection: RealtimeConnection::Audio, + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, @@ -2946,7 +2946,7 @@ async fn inbound_handoff_request_steers_active_turn() -> Result<()> { test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { - connection: RealtimeConnection::Audio, + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, @@ -3098,7 +3098,7 @@ async fn inbound_handoff_request_starts_turn_and_does_not_block_realtime_audio() test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { - connection: RealtimeConnection::Audio, + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, diff --git a/codex-rs/protocol/src/protocol.rs b/codex-rs/protocol/src/protocol.rs index ba56cc35084..83d1fab5369 100644 --- a/codex-rs/protocol/src/protocol.rs +++ b/codex-rs/protocol/src/protocol.rs @@ -135,7 +135,7 @@ pub struct McpServerRefreshConfig { #[derive(Debug, Clone, Deserialize, Serialize, PartialEq, JsonSchema, TS)] pub struct ConversationStartParams { /// Selects whether the realtime session should produce text or audio output. - pub connection: RealtimeConnection, + pub output_modality: RealtimeOutputModality, #[serde( default, deserialize_with = "conversation_start_prompt_serde::deserialize", @@ -161,7 +161,7 @@ pub enum ConversationStartTransport { #[derive(Debug, Clone, Copy, Deserialize, Serialize, PartialEq, Eq, JsonSchema, TS)] #[serde(rename_all = "snake_case")] -pub enum RealtimeConnection { +pub enum RealtimeOutputModality { Text, Audio, } @@ -4603,14 +4603,14 @@ mod tests { }, }); let start = Op::RealtimeConversationStart(ConversationStartParams { - connection: RealtimeConnection::Audio, + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("be helpful".to_string())), session_id: Some("conv_1".to_string()), transport: None, voice: None, }); let webrtc_start = Op::RealtimeConversationStart(ConversationStartParams { - connection: RealtimeConnection::Audio, + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("be helpful".to_string())), session_id: Some("conv_1".to_string()), transport: Some(ConversationStartTransport::Webrtc { @@ -4623,14 +4623,14 @@ mod tests { }); let close = Op::RealtimeConversationClose; let default_prompt_start = Op::RealtimeConversationStart(ConversationStartParams { - connection: RealtimeConnection::Audio, + output_modality: RealtimeOutputModality::Audio, prompt: None, session_id: None, transport: None, voice: None, }); let null_prompt_start = Op::RealtimeConversationStart(ConversationStartParams { - connection: RealtimeConnection::Audio, + output_modality: RealtimeOutputModality::Audio, prompt: Some(None), session_id: None, transport: None, @@ -4642,7 +4642,7 @@ mod tests { serde_json::to_value(&start).unwrap(), json!({ "type": "realtime_conversation_start", - "connection": "audio", + "output_modality": "audio", "prompt": "be helpful", "session_id": "conv_1" }) @@ -4651,21 +4651,21 @@ mod tests { serde_json::to_value(&default_prompt_start).unwrap(), json!({ "type": "realtime_conversation_start", - "connection": "audio" + "output_modality": "audio" }) ); assert_eq!( serde_json::to_value(&null_prompt_start).unwrap(), json!({ "type": "realtime_conversation_start", - "connection": "audio", + "output_modality": "audio", "prompt": null }) ); assert_eq!( serde_json::from_value::(json!({ "type": "realtime_conversation_start", - "connection": "audio" + "output_modality": "audio" })) .unwrap(), default_prompt_start @@ -4673,7 +4673,7 @@ mod tests { assert_eq!( serde_json::from_value::(json!({ "type": "realtime_conversation_start", - "connection": "audio", + "output_modality": "audio", "prompt": null })) .unwrap(), @@ -4719,7 +4719,7 @@ mod tests { serde_json::to_value(&webrtc_start).unwrap(), json!({ "type": "realtime_conversation_start", - "connection": "audio", + "output_modality": "audio", "prompt": "be helpful", "session_id": "conv_1", "transport": { diff --git a/codex-rs/tui/src/app_server_session.rs b/codex-rs/tui/src/app_server_session.rs index b86e9829e08..1afa040d70f 100644 --- a/codex-rs/tui/src/app_server_session.rs +++ b/codex-rs/tui/src/app_server_session.rs @@ -657,7 +657,7 @@ impl AppServerSession { request_id, params: ThreadRealtimeStartParams { thread_id: thread_id.to_string(), - connection: params.connection, + output_modality: params.output_modality, prompt: params.prompt, session_id: params.session_id, voice: params.voice, diff --git a/codex-rs/tui/src/chatwidget/realtime.rs b/codex-rs/tui/src/chatwidget/realtime.rs index 88c2eab9535..e6c6ed49bee 100644 --- a/codex-rs/tui/src/chatwidget/realtime.rs +++ b/codex-rs/tui/src/chatwidget/realtime.rs @@ -3,11 +3,11 @@ use codex_config::config_toml::RealtimeTransport; use codex_protocol::protocol::ConversationStartParams; use codex_protocol::protocol::ConversationStartTransport; use codex_protocol::protocol::RealtimeAudioFrame; -use codex_protocol::protocol::RealtimeConnection; use codex_protocol::protocol::RealtimeConversationClosedEvent; use codex_protocol::protocol::RealtimeConversationRealtimeEvent; use codex_protocol::protocol::RealtimeConversationStartedEvent; use codex_protocol::protocol::RealtimeEvent; +use codex_protocol::protocol::RealtimeOutputModality; use codex_realtime_webrtc::RealtimeWebrtcEvent; use codex_realtime_webrtc::RealtimeWebrtcSession; use codex_realtime_webrtc::RealtimeWebrtcSessionHandle; @@ -237,7 +237,7 @@ impl ChatWidget { ) { self.submit_op(AppCommand::realtime_conversation_start( ConversationStartParams { - connection: RealtimeConnection::Audio, + output_modality: RealtimeOutputModality::Audio, prompt: None, session_id: None, transport, From 001cdd274ec8567d9cbce4fb736128def9755a65 Mon Sep 17 00:00:00 2001 From: Ahmed Ibrahim Date: Mon, 13 Apr 2026 16:04:12 -0700 Subject: [PATCH 04/17] Split realtime transcript notifications --- .../schema/json/ServerNotification.json | 68 ++++++++---- .../codex_app_server_protocol.schemas.json | 72 ++++++++---- .../codex_app_server_protocol.v2.schemas.json | 72 ++++++++---- ...adRealtimeTranscriptDeltaNotification.json | 23 ++++ ...eadRealtimeTranscriptDoneNotification.json | 23 ++++ ...RealtimeTranscriptUpdatedNotification.json | 36 ------ .../RealtimeTranscriptUpdateKind.ts | 5 - .../schema/typescript/ServerNotification.ts | 5 +- .../schema/typescript/index.ts | 1 - ...readRealtimeTranscriptDeltaNotification.ts | 13 +++ ...hreadRealtimeTranscriptDoneNotification.ts | 13 +++ ...adRealtimeTranscriptUpdatedNotification.ts | 14 --- .../schema/typescript/v2/index.ts | 3 +- .../src/protocol/common.rs | 6 +- .../app-server-protocol/src/protocol/v2.rs | 22 +++- codex-rs/app-server/README.md | 3 +- .../app-server/src/bespoke_event_handling.rs | 41 +++++-- .../tests/suite/v2/realtime_conversation.rs | 105 ++++++++---------- codex-rs/codex-api/src/endpoint/mod.rs | 1 - .../endpoint/realtime_websocket/methods.rs | 47 +++----- .../src/endpoint/realtime_websocket/mod.rs | 1 - .../endpoint/realtime_websocket/protocol.rs | 2 - .../realtime_websocket/protocol_common.rs | 15 ++- .../realtime_websocket/protocol_v2.rs | 14 +-- codex-rs/codex-api/src/lib.rs | 1 - codex-rs/core/src/realtime_conversation.rs | 2 + codex-rs/protocol/src/protocol.rs | 15 ++- codex-rs/tui/src/app/app_server_adapter.rs | 5 +- codex-rs/tui/src/chatwidget.rs | 3 +- codex-rs/tui/src/chatwidget/realtime.rs | 2 + 30 files changed, 378 insertions(+), 255 deletions(-) create mode 100644 codex-rs/app-server-protocol/schema/json/v2/ThreadRealtimeTranscriptDeltaNotification.json create mode 100644 codex-rs/app-server-protocol/schema/json/v2/ThreadRealtimeTranscriptDoneNotification.json delete mode 100644 codex-rs/app-server-protocol/schema/json/v2/ThreadRealtimeTranscriptUpdatedNotification.json delete mode 100644 codex-rs/app-server-protocol/schema/typescript/RealtimeTranscriptUpdateKind.ts create mode 100644 codex-rs/app-server-protocol/schema/typescript/v2/ThreadRealtimeTranscriptDeltaNotification.ts create mode 100644 codex-rs/app-server-protocol/schema/typescript/v2/ThreadRealtimeTranscriptDoneNotification.ts delete mode 100644 codex-rs/app-server-protocol/schema/typescript/v2/ThreadRealtimeTranscriptUpdatedNotification.ts diff --git a/codex-rs/app-server-protocol/schema/json/ServerNotification.json b/codex-rs/app-server-protocol/schema/json/ServerNotification.json index df92b988ea7..4edff15748a 100644 --- a/codex-rs/app-server-protocol/schema/json/ServerNotification.json +++ b/codex-rs/app-server-protocol/schema/json/ServerNotification.json @@ -2111,13 +2111,6 @@ ], "type": "string" }, - "RealtimeTranscriptUpdateKind": { - "enum": [ - "delta", - "done" - ], - "type": "string" - }, "ReasoningEffort": { "description": "See https://platform.openai.com/docs/guides/reasoning?api-mode=responses#get-started-with-reasoning", "enum": [ @@ -3391,28 +3384,45 @@ ], "type": "object" }, - "ThreadRealtimeTranscriptUpdatedNotification": { - "description": "EXPERIMENTAL - flat transcript update emitted whenever realtime transcript text changes or completes.", + "ThreadRealtimeTranscriptDeltaNotification": { + "description": "EXPERIMENTAL - flat transcript delta emitted whenever realtime transcript text changes.", + "properties": { + "delta": { + "description": "Live transcript delta from the realtime event.", + "type": "string" + }, + "role": { + "type": "string" + }, + "threadId": { + "type": "string" + } + }, + "required": [ + "delta", + "role", + "threadId" + ], + "type": "object" + }, + "ThreadRealtimeTranscriptDoneNotification": { + "description": "EXPERIMENTAL - final transcript text emitted when realtime completes a transcript part.", "properties": { "role": { "type": "string" }, "text": { - "description": "Delta text for delta updates; final complete text for done updates.", + "description": "Final complete text for the transcript part.", "type": "string" }, "threadId": { "type": "string" - }, - "updateKind": { - "$ref": "#/definitions/RealtimeTranscriptUpdateKind" } }, "required": [ "role", "text", - "threadId", - "updateKind" + "threadId" ], "type": "object" }, @@ -4961,20 +4971,40 @@ "properties": { "method": { "enum": [ - "thread/realtime/transcriptUpdated" + "thread/realtime/transcript/delta" + ], + "title": "Thread/realtime/transcript/deltaNotificationMethod", + "type": "string" + }, + "params": { + "$ref": "#/definitions/ThreadRealtimeTranscriptDeltaNotification" + } + }, + "required": [ + "method", + "params" + ], + "title": "Thread/realtime/transcript/deltaNotification", + "type": "object" + }, + { + "properties": { + "method": { + "enum": [ + "thread/realtime/transcript/done" ], - "title": "Thread/realtime/transcriptUpdatedNotificationMethod", + "title": "Thread/realtime/transcript/doneNotificationMethod", "type": "string" }, "params": { - "$ref": "#/definitions/ThreadRealtimeTranscriptUpdatedNotification" + "$ref": "#/definitions/ThreadRealtimeTranscriptDoneNotification" } }, "required": [ "method", "params" ], - "title": "Thread/realtime/transcriptUpdatedNotification", + "title": "Thread/realtime/transcript/doneNotification", "type": "object" }, { diff --git a/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.schemas.json b/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.schemas.json index 7446569e2cf..c7123429d0c 100644 --- a/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.schemas.json +++ b/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.schemas.json @@ -4330,20 +4330,40 @@ "properties": { "method": { "enum": [ - "thread/realtime/transcriptUpdated" + "thread/realtime/transcript/delta" ], - "title": "Thread/realtime/transcriptUpdatedNotificationMethod", + "title": "Thread/realtime/transcript/deltaNotificationMethod", "type": "string" }, "params": { - "$ref": "#/definitions/v2/ThreadRealtimeTranscriptUpdatedNotification" + "$ref": "#/definitions/v2/ThreadRealtimeTranscriptDeltaNotification" } }, "required": [ "method", "params" ], - "title": "Thread/realtime/transcriptUpdatedNotification", + "title": "Thread/realtime/transcript/deltaNotification", + "type": "object" + }, + { + "properties": { + "method": { + "enum": [ + "thread/realtime/transcript/done" + ], + "title": "Thread/realtime/transcript/doneNotificationMethod", + "type": "string" + }, + "params": { + "$ref": "#/definitions/v2/ThreadRealtimeTranscriptDoneNotification" + } + }, + "required": [ + "method", + "params" + ], + "title": "Thread/realtime/transcript/doneNotification", "type": "object" }, { @@ -10622,13 +10642,6 @@ ], "type": "string" }, - "RealtimeTranscriptUpdateKind": { - "enum": [ - "delta", - "done" - ], - "type": "string" - }, "RealtimeVoice": { "enum": [ "alloy", @@ -13975,31 +13988,50 @@ "title": "ThreadRealtimeStartedNotification", "type": "object" }, - "ThreadRealtimeTranscriptUpdatedNotification": { + "ThreadRealtimeTranscriptDeltaNotification": { "$schema": "http://json-schema.org/draft-07/schema#", - "description": "EXPERIMENTAL - flat transcript update emitted whenever realtime transcript text changes or completes.", + "description": "EXPERIMENTAL - flat transcript delta emitted whenever realtime transcript text changes.", + "properties": { + "delta": { + "description": "Live transcript delta from the realtime event.", + "type": "string" + }, + "role": { + "type": "string" + }, + "threadId": { + "type": "string" + } + }, + "required": [ + "delta", + "role", + "threadId" + ], + "title": "ThreadRealtimeTranscriptDeltaNotification", + "type": "object" + }, + "ThreadRealtimeTranscriptDoneNotification": { + "$schema": "http://json-schema.org/draft-07/schema#", + "description": "EXPERIMENTAL - final transcript text emitted when realtime completes a transcript part.", "properties": { "role": { "type": "string" }, "text": { - "description": "Delta text for delta updates; final complete text for done updates.", + "description": "Final complete text for the transcript part.", "type": "string" }, "threadId": { "type": "string" - }, - "updateKind": { - "$ref": "#/definitions/v2/RealtimeTranscriptUpdateKind" } }, "required": [ "role", "text", - "threadId", - "updateKind" + "threadId" ], - "title": "ThreadRealtimeTranscriptUpdatedNotification", + "title": "ThreadRealtimeTranscriptDoneNotification", "type": "object" }, "ThreadResumeParams": { diff --git a/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.v2.schemas.json b/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.v2.schemas.json index 139125006d3..e2ce7d7f691 100644 --- a/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.v2.schemas.json +++ b/codex-rs/app-server-protocol/schema/json/codex_app_server_protocol.v2.schemas.json @@ -7418,13 +7418,6 @@ ], "type": "string" }, - "RealtimeTranscriptUpdateKind": { - "enum": [ - "delta", - "done" - ], - "type": "string" - }, "RealtimeVoice": { "enum": [ "alloy", @@ -9594,20 +9587,40 @@ "properties": { "method": { "enum": [ - "thread/realtime/transcriptUpdated" + "thread/realtime/transcript/delta" + ], + "title": "Thread/realtime/transcript/deltaNotificationMethod", + "type": "string" + }, + "params": { + "$ref": "#/definitions/ThreadRealtimeTranscriptDeltaNotification" + } + }, + "required": [ + "method", + "params" + ], + "title": "Thread/realtime/transcript/deltaNotification", + "type": "object" + }, + { + "properties": { + "method": { + "enum": [ + "thread/realtime/transcript/done" ], - "title": "Thread/realtime/transcriptUpdatedNotificationMethod", + "title": "Thread/realtime/transcript/doneNotificationMethod", "type": "string" }, "params": { - "$ref": "#/definitions/ThreadRealtimeTranscriptUpdatedNotification" + "$ref": "#/definitions/ThreadRealtimeTranscriptDoneNotification" } }, "required": [ "method", "params" ], - "title": "Thread/realtime/transcriptUpdatedNotification", + "title": "Thread/realtime/transcript/doneNotification", "type": "object" }, { @@ -11823,31 +11836,50 @@ "title": "ThreadRealtimeStartedNotification", "type": "object" }, - "ThreadRealtimeTranscriptUpdatedNotification": { + "ThreadRealtimeTranscriptDeltaNotification": { "$schema": "http://json-schema.org/draft-07/schema#", - "description": "EXPERIMENTAL - flat transcript update emitted whenever realtime transcript text changes or completes.", + "description": "EXPERIMENTAL - flat transcript delta emitted whenever realtime transcript text changes.", + "properties": { + "delta": { + "description": "Live transcript delta from the realtime event.", + "type": "string" + }, + "role": { + "type": "string" + }, + "threadId": { + "type": "string" + } + }, + "required": [ + "delta", + "role", + "threadId" + ], + "title": "ThreadRealtimeTranscriptDeltaNotification", + "type": "object" + }, + "ThreadRealtimeTranscriptDoneNotification": { + "$schema": "http://json-schema.org/draft-07/schema#", + "description": "EXPERIMENTAL - final transcript text emitted when realtime completes a transcript part.", "properties": { "role": { "type": "string" }, "text": { - "description": "Delta text for delta updates; final complete text for done updates.", + "description": "Final complete text for the transcript part.", "type": "string" }, "threadId": { "type": "string" - }, - "updateKind": { - "$ref": "#/definitions/RealtimeTranscriptUpdateKind" } }, "required": [ "role", "text", - "threadId", - "updateKind" + "threadId" ], - "title": "ThreadRealtimeTranscriptUpdatedNotification", + "title": "ThreadRealtimeTranscriptDoneNotification", "type": "object" }, "ThreadResumeParams": { diff --git a/codex-rs/app-server-protocol/schema/json/v2/ThreadRealtimeTranscriptDeltaNotification.json b/codex-rs/app-server-protocol/schema/json/v2/ThreadRealtimeTranscriptDeltaNotification.json new file mode 100644 index 00000000000..22ad778eb2a --- /dev/null +++ b/codex-rs/app-server-protocol/schema/json/v2/ThreadRealtimeTranscriptDeltaNotification.json @@ -0,0 +1,23 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "description": "EXPERIMENTAL - flat transcript delta emitted whenever realtime transcript text changes.", + "properties": { + "delta": { + "description": "Live transcript delta from the realtime event.", + "type": "string" + }, + "role": { + "type": "string" + }, + "threadId": { + "type": "string" + } + }, + "required": [ + "delta", + "role", + "threadId" + ], + "title": "ThreadRealtimeTranscriptDeltaNotification", + "type": "object" +} \ No newline at end of file diff --git a/codex-rs/app-server-protocol/schema/json/v2/ThreadRealtimeTranscriptDoneNotification.json b/codex-rs/app-server-protocol/schema/json/v2/ThreadRealtimeTranscriptDoneNotification.json new file mode 100644 index 00000000000..2f4199fdb9e --- /dev/null +++ b/codex-rs/app-server-protocol/schema/json/v2/ThreadRealtimeTranscriptDoneNotification.json @@ -0,0 +1,23 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "description": "EXPERIMENTAL - final transcript text emitted when realtime completes a transcript part.", + "properties": { + "role": { + "type": "string" + }, + "text": { + "description": "Final complete text for the transcript part.", + "type": "string" + }, + "threadId": { + "type": "string" + } + }, + "required": [ + "role", + "text", + "threadId" + ], + "title": "ThreadRealtimeTranscriptDoneNotification", + "type": "object" +} \ No newline at end of file diff --git a/codex-rs/app-server-protocol/schema/json/v2/ThreadRealtimeTranscriptUpdatedNotification.json b/codex-rs/app-server-protocol/schema/json/v2/ThreadRealtimeTranscriptUpdatedNotification.json deleted file mode 100644 index f936e581720..00000000000 --- a/codex-rs/app-server-protocol/schema/json/v2/ThreadRealtimeTranscriptUpdatedNotification.json +++ /dev/null @@ -1,36 +0,0 @@ -{ - "$schema": "http://json-schema.org/draft-07/schema#", - "definitions": { - "RealtimeTranscriptUpdateKind": { - "enum": [ - "delta", - "done" - ], - "type": "string" - } - }, - "description": "EXPERIMENTAL - flat transcript update emitted whenever realtime transcript text changes or completes.", - "properties": { - "role": { - "type": "string" - }, - "text": { - "description": "Delta text for delta updates; final complete text for done updates.", - "type": "string" - }, - "threadId": { - "type": "string" - }, - "updateKind": { - "$ref": "#/definitions/RealtimeTranscriptUpdateKind" - } - }, - "required": [ - "role", - "text", - "threadId", - "updateKind" - ], - "title": "ThreadRealtimeTranscriptUpdatedNotification", - "type": "object" -} \ No newline at end of file diff --git a/codex-rs/app-server-protocol/schema/typescript/RealtimeTranscriptUpdateKind.ts b/codex-rs/app-server-protocol/schema/typescript/RealtimeTranscriptUpdateKind.ts deleted file mode 100644 index 1bc6d6ed8e7..00000000000 --- a/codex-rs/app-server-protocol/schema/typescript/RealtimeTranscriptUpdateKind.ts +++ /dev/null @@ -1,5 +0,0 @@ -// GENERATED CODE! DO NOT MODIFY BY HAND! - -// This file was generated by [ts-rs](https://github.com/Aleph-Alpha/ts-rs). Do not edit this file manually. - -export type RealtimeTranscriptUpdateKind = "delta" | "done"; diff --git a/codex-rs/app-server-protocol/schema/typescript/ServerNotification.ts b/codex-rs/app-server-protocol/schema/typescript/ServerNotification.ts index a9859141342..1db7027febf 100644 --- a/codex-rs/app-server-protocol/schema/typescript/ServerNotification.ts +++ b/codex-rs/app-server-protocol/schema/typescript/ServerNotification.ts @@ -43,7 +43,8 @@ import type { ThreadRealtimeItemAddedNotification } from "./v2/ThreadRealtimeIte import type { ThreadRealtimeOutputAudioDeltaNotification } from "./v2/ThreadRealtimeOutputAudioDeltaNotification"; import type { ThreadRealtimeSdpNotification } from "./v2/ThreadRealtimeSdpNotification"; import type { ThreadRealtimeStartedNotification } from "./v2/ThreadRealtimeStartedNotification"; -import type { ThreadRealtimeTranscriptUpdatedNotification } from "./v2/ThreadRealtimeTranscriptUpdatedNotification"; +import type { ThreadRealtimeTranscriptDeltaNotification } from "./v2/ThreadRealtimeTranscriptDeltaNotification"; +import type { ThreadRealtimeTranscriptDoneNotification } from "./v2/ThreadRealtimeTranscriptDoneNotification"; import type { ThreadStartedNotification } from "./v2/ThreadStartedNotification"; import type { ThreadStatusChangedNotification } from "./v2/ThreadStatusChangedNotification"; import type { ThreadTokenUsageUpdatedNotification } from "./v2/ThreadTokenUsageUpdatedNotification"; @@ -58,4 +59,4 @@ import type { WindowsWorldWritableWarningNotification } from "./v2/WindowsWorldW /** * Notification sent from the server to the client. */ -export type ServerNotification = { "method": "error", "params": ErrorNotification } | { "method": "thread/started", "params": ThreadStartedNotification } | { "method": "thread/status/changed", "params": ThreadStatusChangedNotification } | { "method": "thread/archived", "params": ThreadArchivedNotification } | { "method": "thread/unarchived", "params": ThreadUnarchivedNotification } | { "method": "thread/closed", "params": ThreadClosedNotification } | { "method": "skills/changed", "params": SkillsChangedNotification } | { "method": "thread/name/updated", "params": ThreadNameUpdatedNotification } | { "method": "thread/tokenUsage/updated", "params": ThreadTokenUsageUpdatedNotification } | { "method": "turn/started", "params": TurnStartedNotification } | { "method": "hook/started", "params": HookStartedNotification } | { "method": "turn/completed", "params": TurnCompletedNotification } | { "method": "hook/completed", "params": HookCompletedNotification } | { "method": "turn/diff/updated", "params": TurnDiffUpdatedNotification } | { "method": "turn/plan/updated", "params": TurnPlanUpdatedNotification } | { "method": "item/started", "params": ItemStartedNotification } | { "method": "item/autoApprovalReview/started", "params": ItemGuardianApprovalReviewStartedNotification } | { "method": "item/autoApprovalReview/completed", "params": ItemGuardianApprovalReviewCompletedNotification } | { "method": "item/completed", "params": ItemCompletedNotification } | { "method": "rawResponseItem/completed", "params": RawResponseItemCompletedNotification } | { "method": "item/agentMessage/delta", "params": AgentMessageDeltaNotification } | { "method": "item/plan/delta", "params": PlanDeltaNotification } | { "method": "command/exec/outputDelta", "params": CommandExecOutputDeltaNotification } | { "method": "item/commandExecution/outputDelta", "params": CommandExecutionOutputDeltaNotification } | { "method": "item/commandExecution/terminalInteraction", "params": TerminalInteractionNotification } | { "method": "item/fileChange/outputDelta", "params": FileChangeOutputDeltaNotification } | { "method": "serverRequest/resolved", "params": ServerRequestResolvedNotification } | { "method": "item/mcpToolCall/progress", "params": McpToolCallProgressNotification } | { "method": "mcpServer/oauthLogin/completed", "params": McpServerOauthLoginCompletedNotification } | { "method": "mcpServer/startupStatus/updated", "params": McpServerStatusUpdatedNotification } | { "method": "account/updated", "params": AccountUpdatedNotification } | { "method": "account/rateLimits/updated", "params": AccountRateLimitsUpdatedNotification } | { "method": "app/list/updated", "params": AppListUpdatedNotification } | { "method": "fs/changed", "params": FsChangedNotification } | { "method": "item/reasoning/summaryTextDelta", "params": ReasoningSummaryTextDeltaNotification } | { "method": "item/reasoning/summaryPartAdded", "params": ReasoningSummaryPartAddedNotification } | { "method": "item/reasoning/textDelta", "params": ReasoningTextDeltaNotification } | { "method": "thread/compacted", "params": ContextCompactedNotification } | { "method": "model/rerouted", "params": ModelReroutedNotification } | { "method": "deprecationNotice", "params": DeprecationNoticeNotification } | { "method": "configWarning", "params": ConfigWarningNotification } | { "method": "fuzzyFileSearch/sessionUpdated", "params": FuzzyFileSearchSessionUpdatedNotification } | { "method": "fuzzyFileSearch/sessionCompleted", "params": FuzzyFileSearchSessionCompletedNotification } | { "method": "thread/realtime/started", "params": ThreadRealtimeStartedNotification } | { "method": "thread/realtime/itemAdded", "params": ThreadRealtimeItemAddedNotification } | { "method": "thread/realtime/transcriptUpdated", "params": ThreadRealtimeTranscriptUpdatedNotification } | { "method": "thread/realtime/outputAudio/delta", "params": ThreadRealtimeOutputAudioDeltaNotification } | { "method": "thread/realtime/sdp", "params": ThreadRealtimeSdpNotification } | { "method": "thread/realtime/error", "params": ThreadRealtimeErrorNotification } | { "method": "thread/realtime/closed", "params": ThreadRealtimeClosedNotification } | { "method": "windows/worldWritableWarning", "params": WindowsWorldWritableWarningNotification } | { "method": "windowsSandbox/setupCompleted", "params": WindowsSandboxSetupCompletedNotification } | { "method": "account/login/completed", "params": AccountLoginCompletedNotification }; +export type ServerNotification = { "method": "error", "params": ErrorNotification } | { "method": "thread/started", "params": ThreadStartedNotification } | { "method": "thread/status/changed", "params": ThreadStatusChangedNotification } | { "method": "thread/archived", "params": ThreadArchivedNotification } | { "method": "thread/unarchived", "params": ThreadUnarchivedNotification } | { "method": "thread/closed", "params": ThreadClosedNotification } | { "method": "skills/changed", "params": SkillsChangedNotification } | { "method": "thread/name/updated", "params": ThreadNameUpdatedNotification } | { "method": "thread/tokenUsage/updated", "params": ThreadTokenUsageUpdatedNotification } | { "method": "turn/started", "params": TurnStartedNotification } | { "method": "hook/started", "params": HookStartedNotification } | { "method": "turn/completed", "params": TurnCompletedNotification } | { "method": "hook/completed", "params": HookCompletedNotification } | { "method": "turn/diff/updated", "params": TurnDiffUpdatedNotification } | { "method": "turn/plan/updated", "params": TurnPlanUpdatedNotification } | { "method": "item/started", "params": ItemStartedNotification } | { "method": "item/autoApprovalReview/started", "params": ItemGuardianApprovalReviewStartedNotification } | { "method": "item/autoApprovalReview/completed", "params": ItemGuardianApprovalReviewCompletedNotification } | { "method": "item/completed", "params": ItemCompletedNotification } | { "method": "rawResponseItem/completed", "params": RawResponseItemCompletedNotification } | { "method": "item/agentMessage/delta", "params": AgentMessageDeltaNotification } | { "method": "item/plan/delta", "params": PlanDeltaNotification } | { "method": "command/exec/outputDelta", "params": CommandExecOutputDeltaNotification } | { "method": "item/commandExecution/outputDelta", "params": CommandExecutionOutputDeltaNotification } | { "method": "item/commandExecution/terminalInteraction", "params": TerminalInteractionNotification } | { "method": "item/fileChange/outputDelta", "params": FileChangeOutputDeltaNotification } | { "method": "serverRequest/resolved", "params": ServerRequestResolvedNotification } | { "method": "item/mcpToolCall/progress", "params": McpToolCallProgressNotification } | { "method": "mcpServer/oauthLogin/completed", "params": McpServerOauthLoginCompletedNotification } | { "method": "mcpServer/startupStatus/updated", "params": McpServerStatusUpdatedNotification } | { "method": "account/updated", "params": AccountUpdatedNotification } | { "method": "account/rateLimits/updated", "params": AccountRateLimitsUpdatedNotification } | { "method": "app/list/updated", "params": AppListUpdatedNotification } | { "method": "fs/changed", "params": FsChangedNotification } | { "method": "item/reasoning/summaryTextDelta", "params": ReasoningSummaryTextDeltaNotification } | { "method": "item/reasoning/summaryPartAdded", "params": ReasoningSummaryPartAddedNotification } | { "method": "item/reasoning/textDelta", "params": ReasoningTextDeltaNotification } | { "method": "thread/compacted", "params": ContextCompactedNotification } | { "method": "model/rerouted", "params": ModelReroutedNotification } | { "method": "deprecationNotice", "params": DeprecationNoticeNotification } | { "method": "configWarning", "params": ConfigWarningNotification } | { "method": "fuzzyFileSearch/sessionUpdated", "params": FuzzyFileSearchSessionUpdatedNotification } | { "method": "fuzzyFileSearch/sessionCompleted", "params": FuzzyFileSearchSessionCompletedNotification } | { "method": "thread/realtime/started", "params": ThreadRealtimeStartedNotification } | { "method": "thread/realtime/itemAdded", "params": ThreadRealtimeItemAddedNotification } | { "method": "thread/realtime/transcript/delta", "params": ThreadRealtimeTranscriptDeltaNotification } | { "method": "thread/realtime/transcript/done", "params": ThreadRealtimeTranscriptDoneNotification } | { "method": "thread/realtime/outputAudio/delta", "params": ThreadRealtimeOutputAudioDeltaNotification } | { "method": "thread/realtime/sdp", "params": ThreadRealtimeSdpNotification } | { "method": "thread/realtime/error", "params": ThreadRealtimeErrorNotification } | { "method": "thread/realtime/closed", "params": ThreadRealtimeClosedNotification } | { "method": "windows/worldWritableWarning", "params": WindowsWorldWritableWarningNotification } | { "method": "windowsSandbox/setupCompleted", "params": WindowsSandboxSetupCompletedNotification } | { "method": "account/login/completed", "params": AccountLoginCompletedNotification }; diff --git a/codex-rs/app-server-protocol/schema/typescript/index.ts b/codex-rs/app-server-protocol/schema/typescript/index.ts index fe9c81917bd..7bbb417fdc9 100644 --- a/codex-rs/app-server-protocol/schema/typescript/index.ts +++ b/codex-rs/app-server-protocol/schema/typescript/index.ts @@ -50,7 +50,6 @@ export type { Personality } from "./Personality"; export type { PlanType } from "./PlanType"; export type { RealtimeConversationVersion } from "./RealtimeConversationVersion"; export type { RealtimeOutputModality } from "./RealtimeOutputModality"; -export type { RealtimeTranscriptUpdateKind } from "./RealtimeTranscriptUpdateKind"; export type { RealtimeVoice } from "./RealtimeVoice"; export type { RealtimeVoicesList } from "./RealtimeVoicesList"; export type { ReasoningEffort } from "./ReasoningEffort"; diff --git a/codex-rs/app-server-protocol/schema/typescript/v2/ThreadRealtimeTranscriptDeltaNotification.ts b/codex-rs/app-server-protocol/schema/typescript/v2/ThreadRealtimeTranscriptDeltaNotification.ts new file mode 100644 index 00000000000..805eeddd768 --- /dev/null +++ b/codex-rs/app-server-protocol/schema/typescript/v2/ThreadRealtimeTranscriptDeltaNotification.ts @@ -0,0 +1,13 @@ +// GENERATED CODE! DO NOT MODIFY BY HAND! + +// This file was generated by [ts-rs](https://github.com/Aleph-Alpha/ts-rs). Do not edit this file manually. + +/** + * EXPERIMENTAL - flat transcript delta emitted whenever realtime + * transcript text changes. + */ +export type ThreadRealtimeTranscriptDeltaNotification = { threadId: string, role: string, +/** + * Live transcript delta from the realtime event. + */ +delta: string, }; diff --git a/codex-rs/app-server-protocol/schema/typescript/v2/ThreadRealtimeTranscriptDoneNotification.ts b/codex-rs/app-server-protocol/schema/typescript/v2/ThreadRealtimeTranscriptDoneNotification.ts new file mode 100644 index 00000000000..d4667ad039f --- /dev/null +++ b/codex-rs/app-server-protocol/schema/typescript/v2/ThreadRealtimeTranscriptDoneNotification.ts @@ -0,0 +1,13 @@ +// GENERATED CODE! DO NOT MODIFY BY HAND! + +// This file was generated by [ts-rs](https://github.com/Aleph-Alpha/ts-rs). Do not edit this file manually. + +/** + * EXPERIMENTAL - final transcript text emitted when realtime completes + * a transcript part. + */ +export type ThreadRealtimeTranscriptDoneNotification = { threadId: string, role: string, +/** + * Final complete text for the transcript part. + */ +text: string, }; diff --git a/codex-rs/app-server-protocol/schema/typescript/v2/ThreadRealtimeTranscriptUpdatedNotification.ts b/codex-rs/app-server-protocol/schema/typescript/v2/ThreadRealtimeTranscriptUpdatedNotification.ts deleted file mode 100644 index bf1b1e3439d..00000000000 --- a/codex-rs/app-server-protocol/schema/typescript/v2/ThreadRealtimeTranscriptUpdatedNotification.ts +++ /dev/null @@ -1,14 +0,0 @@ -// GENERATED CODE! DO NOT MODIFY BY HAND! - -// This file was generated by [ts-rs](https://github.com/Aleph-Alpha/ts-rs). Do not edit this file manually. -import type { RealtimeTranscriptUpdateKind } from "../RealtimeTranscriptUpdateKind"; - -/** - * EXPERIMENTAL - flat transcript update emitted whenever realtime - * transcript text changes or completes. - */ -export type ThreadRealtimeTranscriptUpdatedNotification = { threadId: string, role: string, -/** - * Delta text for delta updates; final complete text for done updates. - */ -text: string, updateKind: RealtimeTranscriptUpdateKind, }; diff --git a/codex-rs/app-server-protocol/schema/typescript/v2/index.ts b/codex-rs/app-server-protocol/schema/typescript/v2/index.ts index f815fee3e9d..16294d28aa7 100644 --- a/codex-rs/app-server-protocol/schema/typescript/v2/index.ts +++ b/codex-rs/app-server-protocol/schema/typescript/v2/index.ts @@ -301,7 +301,8 @@ export type { ThreadRealtimeOutputAudioDeltaNotification } from "./ThreadRealtim export type { ThreadRealtimeSdpNotification } from "./ThreadRealtimeSdpNotification"; export type { ThreadRealtimeStartTransport } from "./ThreadRealtimeStartTransport"; export type { ThreadRealtimeStartedNotification } from "./ThreadRealtimeStartedNotification"; -export type { ThreadRealtimeTranscriptUpdatedNotification } from "./ThreadRealtimeTranscriptUpdatedNotification"; +export type { ThreadRealtimeTranscriptDeltaNotification } from "./ThreadRealtimeTranscriptDeltaNotification"; +export type { ThreadRealtimeTranscriptDoneNotification } from "./ThreadRealtimeTranscriptDoneNotification"; export type { ThreadResumeParams } from "./ThreadResumeParams"; export type { ThreadResumeResponse } from "./ThreadResumeResponse"; export type { ThreadRollbackParams } from "./ThreadRollbackParams"; diff --git a/codex-rs/app-server-protocol/src/protocol/common.rs b/codex-rs/app-server-protocol/src/protocol/common.rs index 3846d5984f2..8d318d42717 100644 --- a/codex-rs/app-server-protocol/src/protocol/common.rs +++ b/codex-rs/app-server-protocol/src/protocol/common.rs @@ -1017,8 +1017,10 @@ server_notification_definitions! { ThreadRealtimeStarted => "thread/realtime/started" (v2::ThreadRealtimeStartedNotification), #[experimental("thread/realtime/itemAdded")] ThreadRealtimeItemAdded => "thread/realtime/itemAdded" (v2::ThreadRealtimeItemAddedNotification), - #[experimental("thread/realtime/transcriptUpdated")] - ThreadRealtimeTranscriptUpdated => "thread/realtime/transcriptUpdated" (v2::ThreadRealtimeTranscriptUpdatedNotification), + #[experimental("thread/realtime/transcript/delta")] + ThreadRealtimeTranscriptDelta => "thread/realtime/transcript/delta" (v2::ThreadRealtimeTranscriptDeltaNotification), + #[experimental("thread/realtime/transcript/done")] + ThreadRealtimeTranscriptDone => "thread/realtime/transcript/done" (v2::ThreadRealtimeTranscriptDoneNotification), #[experimental("thread/realtime/outputAudio/delta")] ThreadRealtimeOutputAudioDelta => "thread/realtime/outputAudio/delta" (v2::ThreadRealtimeOutputAudioDeltaNotification), #[experimental("thread/realtime/sdp")] diff --git a/codex-rs/app-server-protocol/src/protocol/v2.rs b/codex-rs/app-server-protocol/src/protocol/v2.rs index 2c91b7d728b..8a12a2a3733 100644 --- a/codex-rs/app-server-protocol/src/protocol/v2.rs +++ b/codex-rs/app-server-protocol/src/protocol/v2.rs @@ -75,7 +75,6 @@ use codex_protocol::protocol::ReadOnlyAccess as CoreReadOnlyAccess; use codex_protocol::protocol::RealtimeAudioFrame as CoreRealtimeAudioFrame; use codex_protocol::protocol::RealtimeConversationVersion; use codex_protocol::protocol::RealtimeOutputModality; -use codex_protocol::protocol::RealtimeTranscriptUpdateKind; use codex_protocol::protocol::RealtimeVoice; use codex_protocol::protocol::RealtimeVoicesList; use codex_protocol::protocol::ReviewDecision as CoreReviewDecision; @@ -4076,17 +4075,28 @@ pub struct ThreadRealtimeItemAddedNotification { pub item: JsonValue, } -/// EXPERIMENTAL - flat transcript update emitted whenever realtime -/// transcript text changes or completes. +/// EXPERIMENTAL - flat transcript delta emitted whenever realtime +/// transcript text changes. #[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, JsonSchema, TS)] #[serde(rename_all = "camelCase")] #[ts(export_to = "v2/")] -pub struct ThreadRealtimeTranscriptUpdatedNotification { +pub struct ThreadRealtimeTranscriptDeltaNotification { pub thread_id: String, pub role: String, - /// Delta text for delta updates; final complete text for done updates. + /// Live transcript delta from the realtime event. + pub delta: String, +} + +/// EXPERIMENTAL - final transcript text emitted when realtime completes +/// a transcript part. +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, JsonSchema, TS)] +#[serde(rename_all = "camelCase")] +#[ts(export_to = "v2/")] +pub struct ThreadRealtimeTranscriptDoneNotification { + pub thread_id: String, + pub role: String, + /// Final complete text for the transcript part. pub text: String, - pub update_kind: RealtimeTranscriptUpdateKind, } /// EXPERIMENTAL - streamed output audio emitted by thread realtime. diff --git a/codex-rs/app-server/README.md b/codex-rs/app-server/README.md index cf820e28aa4..87c8adc72f2 100644 --- a/codex-rs/app-server/README.md +++ b/codex-rs/app-server/README.md @@ -932,7 +932,8 @@ The thread realtime API emits thread-scoped notifications for session lifecycle - `thread/realtime/started` — `{ threadId, sessionId }` once realtime starts for the thread (experimental). - `thread/realtime/itemAdded` — `{ threadId, item }` for raw non-audio realtime items that do not have a dedicated typed app-server notification, including `handoff_request` (experimental). `item` is forwarded as raw JSON while the upstream websocket item schema remains unstable. -- `thread/realtime/transcriptUpdated` — `{ threadId, role, text, updateKind }` whenever realtime transcript text changes (experimental). `updateKind: "delta"` forwards the live transcript delta, and `updateKind: "done"` forwards the final full text for that output part. +- `thread/realtime/transcript/delta` — `{ threadId, role, delta }` for live realtime transcript deltas (experimental). +- `thread/realtime/transcript/done` — `{ threadId, role, text }` when realtime emits the final full text for a transcript part (experimental). - `thread/realtime/outputAudio/delta` — `{ threadId, audio }` for streamed output audio chunks (experimental). `audio` uses camelCase fields (`data`, `sampleRate`, `numChannels`, `samplesPerChannel`). - `thread/realtime/error` — `{ threadId, message }` when realtime encounters a transport or backend error (experimental). - `thread/realtime/closed` — `{ threadId, reason }` when the realtime transport closes (experimental). diff --git a/codex-rs/app-server/src/bespoke_event_handling.rs b/codex-rs/app-server/src/bespoke_event_handling.rs index 20c8b887ad9..b1202bda2ca 100644 --- a/codex-rs/app-server/src/bespoke_event_handling.rs +++ b/codex-rs/app-server/src/bespoke_event_handling.rs @@ -82,7 +82,8 @@ use codex_app_server_protocol::ThreadRealtimeItemAddedNotification; use codex_app_server_protocol::ThreadRealtimeOutputAudioDeltaNotification; use codex_app_server_protocol::ThreadRealtimeSdpNotification; use codex_app_server_protocol::ThreadRealtimeStartedNotification; -use codex_app_server_protocol::ThreadRealtimeTranscriptUpdatedNotification; +use codex_app_server_protocol::ThreadRealtimeTranscriptDeltaNotification; +use codex_app_server_protocol::ThreadRealtimeTranscriptDoneNotification; use codex_app_server_protocol::ThreadRollbackResponse; use codex_app_server_protocol::ThreadTokenUsage; use codex_app_server_protocol::ThreadTokenUsageUpdatedNotification; @@ -401,28 +402,50 @@ pub(crate) async fn apply_bespoke_event_handling( .await; } RealtimeEvent::InputTranscriptDelta(event) => { - let notification = ThreadRealtimeTranscriptUpdatedNotification { + let notification = ThreadRealtimeTranscriptDeltaNotification { thread_id: conversation_id.to_string(), role: "user".to_string(), - text: event.delta, - update_kind: event.update_kind, + delta: event.delta, }; outgoing .send_server_notification( - ServerNotification::ThreadRealtimeTranscriptUpdated(notification), + ServerNotification::ThreadRealtimeTranscriptDelta(notification), + ) + .await; + } + RealtimeEvent::InputTranscriptDone(event) => { + let notification = ThreadRealtimeTranscriptDoneNotification { + thread_id: conversation_id.to_string(), + role: "user".to_string(), + text: event.text, + }; + outgoing + .send_server_notification( + ServerNotification::ThreadRealtimeTranscriptDone(notification), ) .await; } RealtimeEvent::OutputTranscriptDelta(event) => { - let notification = ThreadRealtimeTranscriptUpdatedNotification { + let notification = ThreadRealtimeTranscriptDeltaNotification { + thread_id: conversation_id.to_string(), + role: "assistant".to_string(), + delta: event.delta, + }; + outgoing + .send_server_notification( + ServerNotification::ThreadRealtimeTranscriptDelta(notification), + ) + .await; + } + RealtimeEvent::OutputTranscriptDone(event) => { + let notification = ThreadRealtimeTranscriptDoneNotification { thread_id: conversation_id.to_string(), role: "assistant".to_string(), - text: event.delta, - update_kind: event.update_kind, + text: event.text, }; outgoing .send_server_notification( - ServerNotification::ThreadRealtimeTranscriptUpdated(notification), + ServerNotification::ThreadRealtimeTranscriptDone(notification), ) .await; } diff --git a/codex-rs/app-server/tests/suite/v2/realtime_conversation.rs b/codex-rs/app-server/tests/suite/v2/realtime_conversation.rs index 47d17a9aa1d..66f7693f750 100644 --- a/codex-rs/app-server/tests/suite/v2/realtime_conversation.rs +++ b/codex-rs/app-server/tests/suite/v2/realtime_conversation.rs @@ -31,7 +31,8 @@ use codex_app_server_protocol::ThreadRealtimeStartTransport; use codex_app_server_protocol::ThreadRealtimeStartedNotification; use codex_app_server_protocol::ThreadRealtimeStopParams; use codex_app_server_protocol::ThreadRealtimeStopResponse; -use codex_app_server_protocol::ThreadRealtimeTranscriptUpdatedNotification; +use codex_app_server_protocol::ThreadRealtimeTranscriptDeltaNotification; +use codex_app_server_protocol::ThreadRealtimeTranscriptDoneNotification; use codex_app_server_protocol::ThreadStartParams; use codex_app_server_protocol::ThreadStartResponse; use codex_app_server_protocol::TurnCompletedNotification; @@ -40,7 +41,6 @@ use codex_features::FEATURES; use codex_features::Feature; use codex_protocol::protocol::RealtimeConversationVersion; use codex_protocol::protocol::RealtimeOutputModality; -use codex_protocol::protocol::RealtimeTranscriptUpdateKind; use codex_protocol::protocol::RealtimeVoice; use codex_protocol::protocol::RealtimeVoicesList; use core_test_support::responses; @@ -624,45 +624,32 @@ async fn realtime_conversation_streams_v2_notifications() -> Result<()> { assert_eq!(item_added.thread_id, output_audio.thread_id); assert_eq!(item_added.item["type"], json!("message")); - let first_transcript_update = read_notification::( + let first_transcript_delta = read_notification::( &mut mcp, - "thread/realtime/transcriptUpdated", + "thread/realtime/transcript/delta", ) .await?; - assert_eq!(first_transcript_update.thread_id, output_audio.thread_id); - assert_eq!(first_transcript_update.role, "user"); - assert_eq!(first_transcript_update.text, "delegate now"); - assert_eq!( - first_transcript_update.update_kind, - RealtimeTranscriptUpdateKind::Delta - ); + assert_eq!(first_transcript_delta.thread_id, output_audio.thread_id); + assert_eq!(first_transcript_delta.role, "user"); + assert_eq!(first_transcript_delta.delta, "delegate now"); - let second_transcript_update = - read_notification::( - &mut mcp, - "thread/realtime/transcriptUpdated", - ) - .await?; - assert_eq!(second_transcript_update.thread_id, output_audio.thread_id); - assert_eq!(second_transcript_update.role, "assistant"); - assert_eq!(second_transcript_update.text, "working"); - assert_eq!( - second_transcript_update.update_kind, - RealtimeTranscriptUpdateKind::Delta - ); + let second_transcript_delta = read_notification::( + &mut mcp, + "thread/realtime/transcript/delta", + ) + .await?; + assert_eq!(second_transcript_delta.thread_id, output_audio.thread_id); + assert_eq!(second_transcript_delta.role, "assistant"); + assert_eq!(second_transcript_delta.delta, "working"); - let final_transcript_update = read_notification::( + let final_transcript_done = read_notification::( &mut mcp, - "thread/realtime/transcriptUpdated", + "thread/realtime/transcript/done", ) .await?; - assert_eq!(final_transcript_update.thread_id, output_audio.thread_id); - assert_eq!(final_transcript_update.role, "assistant"); - assert_eq!(final_transcript_update.text, "working on it"); - assert_eq!( - final_transcript_update.update_kind, - RealtimeTranscriptUpdateKind::Done - ); + assert_eq!(final_transcript_done.thread_id, output_audio.thread_id); + assert_eq!(final_transcript_done.role, "assistant"); + assert_eq!(final_transcript_done.text, "working on it"); let handoff_item_added = read_notification::( &mut mcp, @@ -799,44 +786,44 @@ async fn realtime_text_output_modality_requests_text_output_and_final_transcript json!(["text"]) ); - let first_delta = read_notification::( + let first_delta = read_notification::( &mut mcp, - "thread/realtime/transcriptUpdated", + "thread/realtime/transcript/delta", ) .await?; - let second_delta = read_notification::( + let second_delta = read_notification::( &mut mcp, - "thread/realtime/transcriptUpdated", + "thread/realtime/transcript/delta", ) .await?; - let done = read_notification::( + let done = read_notification::( &mut mcp, - "thread/realtime/transcriptUpdated", + "thread/realtime/transcript/done", ) .await?; assert_eq!( - vec![first_delta, second_delta, done], + vec![first_delta, second_delta], vec![ - ThreadRealtimeTranscriptUpdatedNotification { + ThreadRealtimeTranscriptDeltaNotification { thread_id: thread_start.thread.id.clone(), role: "assistant".to_string(), - text: "hello ".to_string(), - update_kind: RealtimeTranscriptUpdateKind::Delta, + delta: "hello ".to_string(), }, - ThreadRealtimeTranscriptUpdatedNotification { + ThreadRealtimeTranscriptDeltaNotification { thread_id: thread_start.thread.id.clone(), role: "assistant".to_string(), - text: "world".to_string(), - update_kind: RealtimeTranscriptUpdateKind::Delta, - }, - ThreadRealtimeTranscriptUpdatedNotification { - thread_id: thread_start.thread.id, - role: "assistant".to_string(), - text: "hello world".to_string(), - update_kind: RealtimeTranscriptUpdateKind::Done, + delta: "world".to_string(), }, ] ); + assert_eq!( + done, + ThreadRealtimeTranscriptDoneNotification { + thread_id: thread_start.thread.id, + role: "assistant".to_string(), + text: "hello world".to_string(), + } + ); realtime_server.shutdown().await; Ok(()) @@ -1314,12 +1301,11 @@ async fn webrtc_v2_forwards_audio_and_text_between_client_and_sideband() -> Resu harness.append_text(thread_id, "hello").await?; let transcript = harness - .read_notification::( - "thread/realtime/transcriptUpdated", + .read_notification::( + "thread/realtime/transcript/delta", ) .await?; - assert_eq!(transcript.text, "transcribed audio"); - assert_eq!(transcript.update_kind, RealtimeTranscriptUpdateKind::Delta); + assert_eq!(transcript.delta, "transcribed audio"); let output_audio = harness .read_notification::( "thread/realtime/outputAudio/delta", @@ -1404,12 +1390,11 @@ async fn webrtc_v2_text_input_is_append_only_while_response_is_active() -> Resul "first", ); let transcript = harness - .read_notification::( - "thread/realtime/transcriptUpdated", + .read_notification::( + "thread/realtime/transcript/delta", ) .await?; - assert_eq!(transcript.text, "active response started"); - assert_eq!(transcript.update_kind, RealtimeTranscriptUpdateKind::Delta); + assert_eq!(transcript.delta, "active response started"); // Phase 3: send a second text turn while `resp_active` is still open. The // user message must reach realtime without requesting another response. diff --git a/codex-rs/codex-api/src/endpoint/mod.rs b/codex-rs/codex-api/src/endpoint/mod.rs index 515487074f2..c16687ff281 100644 --- a/codex-rs/codex-api/src/endpoint/mod.rs +++ b/codex-rs/codex-api/src/endpoint/mod.rs @@ -16,7 +16,6 @@ pub use realtime_websocket::RealtimeEventParser; pub use realtime_websocket::RealtimeOutputModality; pub use realtime_websocket::RealtimeSessionConfig; pub use realtime_websocket::RealtimeSessionMode; -pub use realtime_websocket::RealtimeTranscriptUpdateKind; pub use realtime_websocket::RealtimeWebsocketClient; pub use realtime_websocket::RealtimeWebsocketConnection; pub use realtime_websocket::RealtimeWebsocketEvents; diff --git a/codex-rs/codex-api/src/endpoint/realtime_websocket/methods.rs b/codex-rs/codex-api/src/endpoint/realtime_websocket/methods.rs index 36bb8cecd0c..dcd330322e6 100644 --- a/codex-rs/codex-api/src/endpoint/realtime_websocket/methods.rs +++ b/codex-rs/codex-api/src/endpoint/realtime_websocket/methods.rs @@ -10,9 +10,7 @@ use crate::endpoint::realtime_websocket::protocol::RealtimeOutboundMessage; use crate::endpoint::realtime_websocket::protocol::RealtimeOutputModality; use crate::endpoint::realtime_websocket::protocol::RealtimeSessionConfig; use crate::endpoint::realtime_websocket::protocol::RealtimeSessionMode; -use crate::endpoint::realtime_websocket::protocol::RealtimeTranscriptDelta; use crate::endpoint::realtime_websocket::protocol::RealtimeTranscriptEntry; -use crate::endpoint::realtime_websocket::protocol::RealtimeTranscriptUpdateKind; use crate::endpoint::realtime_websocket::protocol::RealtimeVoice; use crate::endpoint::realtime_websocket::protocol::parse_realtime_event; use crate::error::ApiError; @@ -416,10 +414,20 @@ impl RealtimeWebsocketEvents { match event { RealtimeEvent::InputAudioSpeechStarted(_) => {} RealtimeEvent::InputTranscriptDelta(update) => { - update_active_transcript_entry(&mut active_transcript.entries, "user", update); + append_transcript_delta(&mut active_transcript.entries, "user", &update.delta); + } + RealtimeEvent::InputTranscriptDone(update) => { + complete_transcript_entry(&mut active_transcript.entries, "user", &update.text); } RealtimeEvent::OutputTranscriptDelta(update) => { - update_active_transcript_entry(&mut active_transcript.entries, "assistant", update); + append_transcript_delta(&mut active_transcript.entries, "assistant", &update.delta); + } + RealtimeEvent::OutputTranscriptDone(update) => { + complete_transcript_entry( + &mut active_transcript.entries, + "assistant", + &update.text, + ); } RealtimeEvent::HandoffRequested(handoff) => { if self.event_parser == RealtimeEventParser::V1 { @@ -456,21 +464,6 @@ fn append_transcript_delta(entries: &mut Vec, role: &st }); } -fn update_active_transcript_entry( - entries: &mut Vec, - role: &str, - update: &RealtimeTranscriptDelta, -) { - match update.update_kind { - RealtimeTranscriptUpdateKind::Delta => { - append_transcript_delta(entries, role, &update.delta) - } - RealtimeTranscriptUpdateKind::Done => { - complete_transcript_entry(entries, role, &update.delta) - } - } -} - fn complete_transcript_entry(entries: &mut Vec, role: &str, text: &str) { if text.is_empty() { return; @@ -770,13 +763,14 @@ fn normalize_realtime_path(url: &mut Url) { #[cfg(test)] mod tests { use super::*; - use crate::endpoint::realtime_websocket::protocol::RealtimeTranscriptDelta; use crate::endpoint::realtime_websocket::protocol::RealtimeTranscriptEntry; use codex_protocol::protocol::RealtimeHandoffRequested; use codex_protocol::protocol::RealtimeInputAudioSpeechStarted; use codex_protocol::protocol::RealtimeResponseCancelled; use codex_protocol::protocol::RealtimeResponseCreated; use codex_protocol::protocol::RealtimeResponseDone; + use codex_protocol::protocol::RealtimeTranscriptDelta; + use codex_protocol::protocol::RealtimeTranscriptDone; use codex_protocol::protocol::RealtimeVoice; use http::HeaderValue; use pretty_assertions::assert_eq; @@ -891,7 +885,6 @@ mod tests { Some(RealtimeEvent::InputTranscriptDelta( RealtimeTranscriptDelta { delta: "hello ".to_string(), - update_kind: RealtimeTranscriptUpdateKind::Delta, } )) ); @@ -910,7 +903,6 @@ mod tests { Some(RealtimeEvent::OutputTranscriptDelta( RealtimeTranscriptDelta { delta: "hi".to_string(), - update_kind: RealtimeTranscriptUpdateKind::Delta, } )) ); @@ -954,7 +946,6 @@ mod tests { Some(RealtimeEvent::InputTranscriptDelta( RealtimeTranscriptDelta { delta: "hello".to_string(), - update_kind: RealtimeTranscriptUpdateKind::Delta, } )) ); @@ -970,10 +961,9 @@ mod tests { assert_eq!( parse_realtime_event(payload.as_str(), RealtimeEventParser::RealtimeV2), - Some(RealtimeEvent::OutputTranscriptDelta( - RealtimeTranscriptDelta { - delta: "all done".to_string(), - update_kind: RealtimeTranscriptUpdateKind::Done, + Some(RealtimeEvent::OutputTranscriptDone( + RealtimeTranscriptDone { + text: "all done".to_string(), } )) ); @@ -1514,7 +1504,6 @@ mod tests { input_delta_event, RealtimeEvent::InputTranscriptDelta(RealtimeTranscriptDelta { delta: "delegate ".to_string(), - update_kind: RealtimeTranscriptUpdateKind::Delta, }) ); @@ -1527,7 +1516,6 @@ mod tests { input_delta_event, RealtimeEvent::InputTranscriptDelta(RealtimeTranscriptDelta { delta: "now".to_string(), - update_kind: RealtimeTranscriptUpdateKind::Delta, }) ); @@ -1540,7 +1528,6 @@ mod tests { output_delta_event, RealtimeEvent::OutputTranscriptDelta(RealtimeTranscriptDelta { delta: "working".to_string(), - update_kind: RealtimeTranscriptUpdateKind::Delta, }) ); diff --git a/codex-rs/codex-api/src/endpoint/realtime_websocket/mod.rs b/codex-rs/codex-api/src/endpoint/realtime_websocket/mod.rs index 254849b7fd8..1fb49b2436f 100644 --- a/codex-rs/codex-api/src/endpoint/realtime_websocket/mod.rs +++ b/codex-rs/codex-api/src/endpoint/realtime_websocket/mod.rs @@ -16,4 +16,3 @@ pub use protocol::RealtimeEventParser; pub use protocol::RealtimeOutputModality; pub use protocol::RealtimeSessionConfig; pub use protocol::RealtimeSessionMode; -pub use protocol::RealtimeTranscriptUpdateKind; diff --git a/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol.rs b/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol.rs index b193de2e16e..0706ea24220 100644 --- a/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol.rs +++ b/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol.rs @@ -3,9 +3,7 @@ use crate::endpoint::realtime_websocket::protocol_v2::parse_realtime_event_v2; pub use codex_protocol::protocol::RealtimeAudioFrame; pub use codex_protocol::protocol::RealtimeEvent; pub use codex_protocol::protocol::RealtimeOutputModality; -pub use codex_protocol::protocol::RealtimeTranscriptDelta; pub use codex_protocol::protocol::RealtimeTranscriptEntry; -pub use codex_protocol::protocol::RealtimeTranscriptUpdateKind; pub use codex_protocol::protocol::RealtimeVoice; use serde::Serialize; use serde_json::Value; diff --git a/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol_common.rs b/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol_common.rs index 58639ee21f1..c89c5ea4d05 100644 --- a/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol_common.rs +++ b/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol_common.rs @@ -1,6 +1,6 @@ use codex_protocol::protocol::RealtimeEvent; use codex_protocol::protocol::RealtimeTranscriptDelta; -use codex_protocol::protocol::RealtimeTranscriptUpdateKind; +use codex_protocol::protocol::RealtimeTranscriptDone; use serde_json::Value; use tracing::debug; @@ -47,19 +47,22 @@ pub(super) fn parse_transcript_delta_event( parsed: &Value, field: &str, ) -> Option { - parse_transcript_update_event(parsed, field, RealtimeTranscriptUpdateKind::Delta) + parsed + .get(field) + .and_then(Value::as_str) + .map(str::to_string) + .map(|delta| RealtimeTranscriptDelta { delta }) } -pub(super) fn parse_transcript_update_event( +pub(super) fn parse_transcript_done_event( parsed: &Value, field: &str, - update_kind: RealtimeTranscriptUpdateKind, -) -> Option { +) -> Option { parsed .get(field) .and_then(Value::as_str) .map(str::to_string) - .map(|delta| RealtimeTranscriptDelta { delta, update_kind }) + .map(|text| RealtimeTranscriptDone { text }) } pub(super) fn parse_error_event(parsed: &Value) -> Option { diff --git a/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol_v2.rs b/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol_v2.rs index d91d6c1d2fb..e09a60cb210 100644 --- a/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol_v2.rs +++ b/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol_v2.rs @@ -2,7 +2,7 @@ use crate::endpoint::realtime_websocket::protocol_common::parse_error_event; use crate::endpoint::realtime_websocket::protocol_common::parse_realtime_payload; use crate::endpoint::realtime_websocket::protocol_common::parse_session_updated_event; use crate::endpoint::realtime_websocket::protocol_common::parse_transcript_delta_event; -use crate::endpoint::realtime_websocket::protocol_common::parse_transcript_update_event; +use crate::endpoint::realtime_websocket::protocol_common::parse_transcript_done_event; use codex_protocol::protocol::RealtimeAudioFrame; use codex_protocol::protocol::RealtimeEvent; use codex_protocol::protocol::RealtimeHandoffRequested; @@ -10,7 +10,6 @@ use codex_protocol::protocol::RealtimeInputAudioSpeechStarted; use codex_protocol::protocol::RealtimeResponseCancelled; use codex_protocol::protocol::RealtimeResponseCreated; use codex_protocol::protocol::RealtimeResponseDone; -use codex_protocol::protocol::RealtimeTranscriptUpdateKind; use serde_json::Map as JsonMap; use serde_json::Value; use tracing::debug; @@ -32,19 +31,18 @@ pub(super) fn parse_realtime_event_v2(payload: &str) -> Option { parse_transcript_delta_event(&parsed, "delta").map(RealtimeEvent::InputTranscriptDelta) } "conversation.item.input_audio_transcription.completed" => { - parse_transcript_update_event(&parsed, "transcript", RealtimeTranscriptUpdateKind::Done) - .map(RealtimeEvent::InputTranscriptDelta) + parse_transcript_done_event(&parsed, "transcript") + .map(RealtimeEvent::InputTranscriptDone) } "response.output_text.delta" | "response.output_audio_transcript.delta" => { parse_transcript_delta_event(&parsed, "delta").map(RealtimeEvent::OutputTranscriptDelta) } "response.output_text.done" => { - parse_transcript_update_event(&parsed, "text", RealtimeTranscriptUpdateKind::Done) - .map(RealtimeEvent::OutputTranscriptDelta) + parse_transcript_done_event(&parsed, "text").map(RealtimeEvent::OutputTranscriptDone) } "response.output_audio_transcript.done" => { - parse_transcript_update_event(&parsed, "transcript", RealtimeTranscriptUpdateKind::Done) - .map(RealtimeEvent::OutputTranscriptDelta) + parse_transcript_done_event(&parsed, "transcript") + .map(RealtimeEvent::OutputTranscriptDone) } "input_audio_buffer.speech_started" => Some(RealtimeEvent::InputAudioSpeechStarted( RealtimeInputAudioSpeechStarted { diff --git a/codex-rs/codex-api/src/lib.rs b/codex-rs/codex-api/src/lib.rs index 1a280c8bf41..f4f90b289cc 100644 --- a/codex-rs/codex-api/src/lib.rs +++ b/codex-rs/codex-api/src/lib.rs @@ -44,7 +44,6 @@ pub use crate::endpoint::RealtimeEventParser; pub use crate::endpoint::RealtimeOutputModality; pub use crate::endpoint::RealtimeSessionConfig; pub use crate::endpoint::RealtimeSessionMode; -pub use crate::endpoint::RealtimeTranscriptUpdateKind; pub use crate::endpoint::RealtimeWebsocketClient; pub use crate::endpoint::RealtimeWebsocketConnection; pub use crate::endpoint::RealtimeWebsocketEvents; diff --git a/codex-rs/core/src/realtime_conversation.rs b/codex-rs/core/src/realtime_conversation.rs index be9568cb3aa..e749736710f 100644 --- a/codex-rs/core/src/realtime_conversation.rs +++ b/codex-rs/core/src/realtime_conversation.rs @@ -1232,7 +1232,9 @@ async fn handle_realtime_server_event( RealtimeEvent::Error(_) => true, RealtimeEvent::SessionUpdated { .. } | RealtimeEvent::InputTranscriptDelta(_) + | RealtimeEvent::InputTranscriptDone(_) | RealtimeEvent::OutputTranscriptDelta(_) + | RealtimeEvent::OutputTranscriptDone(_) | RealtimeEvent::ConversationItemAdded(_) | RealtimeEvent::ConversationItemDone { .. } => false, }; diff --git a/codex-rs/protocol/src/protocol.rs b/codex-rs/protocol/src/protocol.rs index 83d1fab5369..7431e5f1230 100644 --- a/codex-rs/protocol/src/protocol.rs +++ b/codex-rs/protocol/src/protocol.rs @@ -294,17 +294,14 @@ pub struct RealtimeAudioFrame { pub item_id: Option, } -#[derive(Debug, Clone, Copy, Deserialize, Serialize, PartialEq, Eq, JsonSchema, TS)] -#[serde(rename_all = "snake_case")] -pub enum RealtimeTranscriptUpdateKind { - Delta, - Done, -} - #[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq, JsonSchema, TS)] pub struct RealtimeTranscriptDelta { pub delta: String, - pub update_kind: RealtimeTranscriptUpdateKind, +} + +#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq, JsonSchema, TS)] +pub struct RealtimeTranscriptDone { + pub text: String, } #[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq, JsonSchema, TS)] @@ -349,7 +346,9 @@ pub enum RealtimeEvent { }, InputAudioSpeechStarted(RealtimeInputAudioSpeechStarted), InputTranscriptDelta(RealtimeTranscriptDelta), + InputTranscriptDone(RealtimeTranscriptDone), OutputTranscriptDelta(RealtimeTranscriptDelta), + OutputTranscriptDone(RealtimeTranscriptDone), AudioOut(RealtimeAudioFrame), ResponseCreated(RealtimeResponseCreated), ResponseCancelled(RealtimeResponseCancelled), diff --git a/codex-rs/tui/src/app/app_server_adapter.rs b/codex-rs/tui/src/app/app_server_adapter.rs index 0bd78f353a9..b5eccbfc6d6 100644 --- a/codex-rs/tui/src/app/app_server_adapter.rs +++ b/codex-rs/tui/src/app/app_server_adapter.rs @@ -385,7 +385,10 @@ fn server_notification_thread_target( ServerNotification::ThreadRealtimeItemAdded(notification) => { Some(notification.thread_id.as_str()) } - ServerNotification::ThreadRealtimeTranscriptUpdated(notification) => { + ServerNotification::ThreadRealtimeTranscriptDelta(notification) => { + Some(notification.thread_id.as_str()) + } + ServerNotification::ThreadRealtimeTranscriptDone(notification) => { Some(notification.thread_id.as_str()) } ServerNotification::ThreadRealtimeOutputAudioDelta(notification) => { diff --git a/codex-rs/tui/src/chatwidget.rs b/codex-rs/tui/src/chatwidget.rs index efb4c041940..4d5275a25dc 100644 --- a/codex-rs/tui/src/chatwidget.rs +++ b/codex-rs/tui/src/chatwidget.rs @@ -6245,7 +6245,8 @@ impl ChatWidget { | ServerNotification::FsChanged(_) | ServerNotification::FuzzyFileSearchSessionUpdated(_) | ServerNotification::FuzzyFileSearchSessionCompleted(_) - | ServerNotification::ThreadRealtimeTranscriptUpdated(_) + | ServerNotification::ThreadRealtimeTranscriptDelta(_) + | ServerNotification::ThreadRealtimeTranscriptDone(_) | ServerNotification::WindowsWorldWritableWarning(_) | ServerNotification::WindowsSandboxSetupCompleted(_) | ServerNotification::AccountLoginCompleted(_) => {} diff --git a/codex-rs/tui/src/chatwidget/realtime.rs b/codex-rs/tui/src/chatwidget/realtime.rs index e6c6ed49bee..6357361f8ee 100644 --- a/codex-rs/tui/src/chatwidget/realtime.rs +++ b/codex-rs/tui/src/chatwidget/realtime.rs @@ -329,7 +329,9 @@ impl ChatWidget { } RealtimeEvent::InputAudioSpeechStarted(_) => self.interrupt_realtime_audio_playback(), RealtimeEvent::InputTranscriptDelta(_) => {} + RealtimeEvent::InputTranscriptDone(_) => {} RealtimeEvent::OutputTranscriptDelta(_) => {} + RealtimeEvent::OutputTranscriptDone(_) => {} RealtimeEvent::AudioOut(frame) => self.enqueue_realtime_audio_out(&frame), RealtimeEvent::ResponseCreated(_) => {} RealtimeEvent::ResponseCancelled(_) => self.interrupt_realtime_audio_playback(), From a4320be955bfc7aa501b72096e45128965207d7b Mon Sep 17 00:00:00 2001 From: Ahmed Ibrahim Date: Mon, 13 Apr 2026 16:24:46 -0700 Subject: [PATCH 05/17] codex: address PR review feedback (#17701) --- .../endpoint/realtime_websocket/methods.rs | 117 ++++++++++++++---- 1 file changed, 92 insertions(+), 25 deletions(-) diff --git a/codex-rs/codex-api/src/endpoint/realtime_websocket/methods.rs b/codex-rs/codex-api/src/endpoint/realtime_websocket/methods.rs index dcd330322e6..9b7956b3469 100644 --- a/codex-rs/codex-api/src/endpoint/realtime_websocket/methods.rs +++ b/codex-rs/codex-api/src/endpoint/realtime_websocket/methods.rs @@ -211,9 +211,17 @@ pub struct RealtimeWebsocketEvents { is_closed: Arc, } -#[derive(Default)] +#[derive(Debug, Default, PartialEq, Eq)] struct ActiveTranscriptState { entries: Vec, + in_progress_parts: Vec, +} + +#[derive(Debug, PartialEq, Eq)] +struct ActiveTranscriptPart { + role: String, + entry_index: usize, + start: usize, } impl RealtimeWebsocketConnection { @@ -414,24 +422,21 @@ impl RealtimeWebsocketEvents { match event { RealtimeEvent::InputAudioSpeechStarted(_) => {} RealtimeEvent::InputTranscriptDelta(update) => { - append_transcript_delta(&mut active_transcript.entries, "user", &update.delta); + append_transcript_delta(&mut active_transcript, "user", &update.delta); } RealtimeEvent::InputTranscriptDone(update) => { - complete_transcript_entry(&mut active_transcript.entries, "user", &update.text); + complete_transcript_entry(&mut active_transcript, "user", &update.text); } RealtimeEvent::OutputTranscriptDelta(update) => { - append_transcript_delta(&mut active_transcript.entries, "assistant", &update.delta); + append_transcript_delta(&mut active_transcript, "assistant", &update.delta); } RealtimeEvent::OutputTranscriptDone(update) => { - complete_transcript_entry( - &mut active_transcript.entries, - "assistant", - &update.text, - ); + complete_transcript_entry(&mut active_transcript, "assistant", &update.text); } RealtimeEvent::HandoffRequested(handoff) => { if self.event_parser == RealtimeEventParser::V1 { handoff.active_transcript = std::mem::take(&mut active_transcript.entries); + active_transcript.in_progress_parts.clear(); } } RealtimeEvent::SessionUpdated { .. } @@ -446,39 +451,80 @@ impl RealtimeWebsocketEvents { } } -fn append_transcript_delta(entries: &mut Vec, role: &str, delta: &str) { +fn append_transcript_delta(state: &mut ActiveTranscriptState, role: &str, delta: &str) { if delta.is_empty() { return; } - if let Some(last_entry) = entries.last_mut() - && last_entry.role == role - { - last_entry.text.push_str(delta); - return; - } + let entry_index = match state.entries.last() { + Some(last_entry) if last_entry.role == role => state.entries.len() - 1, + _ => { + state.entries.push(RealtimeTranscriptEntry { + role: role.to_string(), + text: String::new(), + }); + state.entries.len() - 1 + } + }; - entries.push(RealtimeTranscriptEntry { + let part_index = state + .in_progress_parts + .iter() + .position(|part| part.role == role); + let start = part_index + .and_then(|index| state.in_progress_parts.get(index)) + .filter(|part| part.entry_index == entry_index) + .map(|part| part.start) + .unwrap_or_else(|| state.entries[entry_index].text.len()); + + let active_part = ActiveTranscriptPart { role: role.to_string(), - text: delta.to_string(), - }); + entry_index, + start, + }; + if let Some(index) = part_index { + state.in_progress_parts[index] = active_part; + } else { + state.in_progress_parts.push(active_part); + } + + state.entries[entry_index].text.push_str(delta); } -fn complete_transcript_entry(entries: &mut Vec, role: &str, text: &str) { +fn complete_transcript_entry(state: &mut ActiveTranscriptState, role: &str, text: &str) { + let part_index = state + .in_progress_parts + .iter() + .position(|part| part.role == role); + if text.is_empty() { + if let Some(part_index) = part_index { + state.in_progress_parts.swap_remove(part_index); + } return; } - // Final transcript events carry the complete part. Replace the in-progress entry - // instead of appending, so active transcript state stays usable after deltas. - if let Some(last_entry) = entries.last_mut() + if let Some(part_index) = part_index { + let part = state.in_progress_parts.swap_remove(part_index); + if let Some(entry) = state.entries.get_mut(part.entry_index) + && entry.role == role + && part.start <= entry.text.len() + { + // Done events carry the complete current part, not the complete + // message, so only replace the suffix accumulated for that part. + entry.text.replace_range(part.start.., text); + return; + } + } + + if let Some(last_entry) = state.entries.last_mut() && last_entry.role == role { - last_entry.text = text.to_string(); + last_entry.text.push_str(text); return; } - entries.push(RealtimeTranscriptEntry { + state.entries.push(RealtimeTranscriptEntry { role: role.to_string(), text: text.to_string(), }); @@ -969,6 +1015,27 @@ mod tests { ); } + #[test] + fn complete_transcript_entry_replaces_current_part_only() { + let mut state = ActiveTranscriptState::default(); + + append_transcript_delta(&mut state, "assistant", "hello"); + complete_transcript_entry(&mut state, "assistant", "hello"); + append_transcript_delta(&mut state, "assistant", " wor"); + complete_transcript_entry(&mut state, "assistant", " world"); + + assert_eq!( + state, + ActiveTranscriptState { + entries: vec![RealtimeTranscriptEntry { + role: "assistant".to_string(), + text: "hello world".to_string(), + }], + in_progress_parts: Vec::new(), + } + ); + } + #[test] fn parse_realtime_v2_output_audio_delta_defaults_audio_shape() { let payload = json!({ From 96ea906c8c1c4ef59bf14b272cdfb1f675be22d0 Mon Sep 17 00:00:00 2001 From: Ahmed Ibrahim Date: Mon, 13 Apr 2026 16:31:03 -0700 Subject: [PATCH 06/17] codex: fix CI failure on PR #17701 --- codex-rs/core/tests/suite/realtime_conversation.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/codex-rs/core/tests/suite/realtime_conversation.rs b/codex-rs/core/tests/suite/realtime_conversation.rs index 1876a084a92..174491f9d60 100644 --- a/codex-rs/core/tests/suite/realtime_conversation.rs +++ b/codex-rs/core/tests/suite/realtime_conversation.rs @@ -1972,6 +1972,7 @@ async fn conversation_user_text_turn_is_capped_when_mirrored_to_realtime() -> Re // active WebSocket session. test.codex .submit(Op::RealtimeConversationStart(ConversationStartParams { + output_modality: RealtimeOutputModality::Audio, prompt: Some(Some("backend prompt".to_string())), session_id: None, transport: None, From 837415946ff3af281c875fb7f8a5171600f1c8f3 Mon Sep 17 00:00:00 2001 From: Ahmed Ibrahim Date: Mon, 13 Apr 2026 16:44:49 -0700 Subject: [PATCH 07/17] codex: address PR review feedback (#17701) --- .../endpoint/realtime_websocket/methods.rs | 242 ++++++++++++++---- .../realtime_websocket/protocol_common.rs | 28 +- codex-rs/protocol/src/protocol.rs | 30 ++- 3 files changed, 244 insertions(+), 56 deletions(-) diff --git a/codex-rs/codex-api/src/endpoint/realtime_websocket/methods.rs b/codex-rs/codex-api/src/endpoint/realtime_websocket/methods.rs index 9b7956b3469..4fa6ba554cc 100644 --- a/codex-rs/codex-api/src/endpoint/realtime_websocket/methods.rs +++ b/codex-rs/codex-api/src/endpoint/realtime_websocket/methods.rs @@ -17,6 +17,8 @@ use crate::error::ApiError; use crate::provider::Provider; use codex_client::backoff; use codex_client::maybe_build_rustls_client_config_with_custom_ca; +use codex_protocol::protocol::RealtimeTranscriptDelta; +use codex_protocol::protocol::RealtimeTranscriptDone; use codex_utils_rustls_provider::ensure_rustls_crypto_provider; use futures::SinkExt; use futures::StreamExt; @@ -219,9 +221,18 @@ struct ActiveTranscriptState { #[derive(Debug, PartialEq, Eq)] struct ActiveTranscriptPart { - role: String, + key: TranscriptPartKey, entry_index: usize, start: usize, + end: usize, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +struct TranscriptPartKey { + role: String, + item_id: Option, + output_index: Option, + content_index: Option, } impl RealtimeWebsocketConnection { @@ -422,16 +433,16 @@ impl RealtimeWebsocketEvents { match event { RealtimeEvent::InputAudioSpeechStarted(_) => {} RealtimeEvent::InputTranscriptDelta(update) => { - append_transcript_delta(&mut active_transcript, "user", &update.delta); + append_transcript_delta(&mut active_transcript, "user", update); } RealtimeEvent::InputTranscriptDone(update) => { - complete_transcript_entry(&mut active_transcript, "user", &update.text); + complete_transcript_entry(&mut active_transcript, "user", update); } RealtimeEvent::OutputTranscriptDelta(update) => { - append_transcript_delta(&mut active_transcript, "assistant", &update.delta); + append_transcript_delta(&mut active_transcript, "assistant", update); } RealtimeEvent::OutputTranscriptDone(update) => { - complete_transcript_entry(&mut active_transcript, "assistant", &update.text); + complete_transcript_entry(&mut active_transcript, "assistant", update); } RealtimeEvent::HandoffRequested(handoff) => { if self.event_parser == RealtimeEventParser::V1 { @@ -451,72 +462,106 @@ impl RealtimeWebsocketEvents { } } -fn append_transcript_delta(state: &mut ActiveTranscriptState, role: &str, delta: &str) { - if delta.is_empty() { +fn append_transcript_delta( + state: &mut ActiveTranscriptState, + role: &str, + update: &RealtimeTranscriptDelta, +) { + if update.delta.is_empty() { return; } - let entry_index = match state.entries.last() { - Some(last_entry) if last_entry.role == role => state.entries.len() - 1, - _ => { - state.entries.push(RealtimeTranscriptEntry { - role: role.to_string(), - text: String::new(), - }); - state.entries.len() - 1 - } - }; + let key = transcript_part_key( + role, + update.item_id.as_deref(), + update.output_index, + update.content_index, + ); - let part_index = state + if let Some(part_index) = state .in_progress_parts .iter() - .position(|part| part.role == role); - let start = part_index - .and_then(|index| state.in_progress_parts.get(index)) - .filter(|part| part.entry_index == entry_index) - .map(|part| part.start) - .unwrap_or_else(|| state.entries[entry_index].text.len()); - - let active_part = ActiveTranscriptPart { - role: role.to_string(), - entry_index, - start, - }; - if let Some(index) = part_index { - state.in_progress_parts[index] = active_part; - } else { - state.in_progress_parts.push(active_part); + .position(|part| part.key == key) + { + let entry_index = state.in_progress_parts[part_index].entry_index; + if state + .entries + .get(entry_index) + .is_some_and(|entry| entry.role == role) + { + let insert_at = state.in_progress_parts[part_index].end; + state.entries[entry_index] + .text + .insert_str(insert_at, &update.delta); + adjust_transcript_part_offsets( + state, + entry_index, + insert_at, + isize::try_from(update.delta.len()).unwrap_or(isize::MAX), + ); + state.in_progress_parts[part_index].end += update.delta.len(); + return; + } + + state.in_progress_parts.swap_remove(part_index); } - state.entries[entry_index].text.push_str(delta); + let entry_index = transcript_entry_index(&mut state.entries, role); + let start = state.entries[entry_index].text.len(); + state.entries[entry_index].text.push_str(&update.delta); + + state.in_progress_parts.push(ActiveTranscriptPart { + key, + entry_index, + start, + end: start + update.delta.len(), + }); } -fn complete_transcript_entry(state: &mut ActiveTranscriptState, role: &str, text: &str) { +fn complete_transcript_entry( + state: &mut ActiveTranscriptState, + role: &str, + update: &RealtimeTranscriptDone, +) { + let key = transcript_part_key( + role, + update.item_id.as_deref(), + update.output_index, + update.content_index, + ); let part_index = state .in_progress_parts .iter() - .position(|part| part.role == role); - - if text.is_empty() { - if let Some(part_index) = part_index { - state.in_progress_parts.swap_remove(part_index); - } - return; - } + .position(|part| part.key == key); if let Some(part_index) = part_index { let part = state.in_progress_parts.swap_remove(part_index); - if let Some(entry) = state.entries.get_mut(part.entry_index) + let old_len = part.end.saturating_sub(part.start); + let replaced = if let Some(entry) = state.entries.get_mut(part.entry_index) && entry.role == role - && part.start <= entry.text.len() + && part.end <= entry.text.len() { // Done events carry the complete current part, not the complete - // message, so only replace the suffix accumulated for that part. - entry.text.replace_range(part.start.., text); + // message, so only replace the range accumulated for that part. + entry.text.replace_range(part.start..part.end, &update.text); + true + } else { + false + }; + + if replaced { + let length_delta = isize::try_from(update.text.len()).unwrap_or(isize::MAX) + - isize::try_from(old_len).unwrap_or(isize::MAX); + adjust_transcript_part_offsets(state, part.entry_index, part.end, length_delta); return; } } + let text = &update.text; + if text.is_empty() { + return; + } + if let Some(last_entry) = state.entries.last_mut() && last_entry.role == role { @@ -530,6 +575,59 @@ fn complete_transcript_entry(state: &mut ActiveTranscriptState, role: &str, text }); } +fn transcript_part_key( + role: &str, + item_id: Option<&str>, + output_index: Option, + content_index: Option, +) -> TranscriptPartKey { + TranscriptPartKey { + role: role.to_string(), + item_id: item_id.map(str::to_string), + output_index, + content_index, + } +} + +fn transcript_entry_index(entries: &mut Vec, role: &str) -> usize { + match entries.last() { + Some(last_entry) if last_entry.role == role => entries.len() - 1, + _ => { + entries.push(RealtimeTranscriptEntry { + role: role.to_string(), + text: String::new(), + }); + entries.len() - 1 + } + } +} + +fn adjust_transcript_part_offsets( + state: &mut ActiveTranscriptState, + entry_index: usize, + after: usize, + amount: isize, +) { + if amount == 0 { + return; + } + + for part in &mut state.in_progress_parts { + if part.entry_index == entry_index && part.start >= after { + adjust_offset(&mut part.start, amount); + adjust_offset(&mut part.end, amount); + } + } +} + +fn adjust_offset(offset: &mut usize, amount: isize) { + if amount.is_positive() { + *offset += amount.unsigned_abs(); + } else { + *offset -= amount.unsigned_abs(); + } +} + pub struct RealtimeWebsocketClient { provider: Provider, } @@ -931,6 +1029,7 @@ mod tests { Some(RealtimeEvent::InputTranscriptDelta( RealtimeTranscriptDelta { delta: "hello ".to_string(), + ..Default::default() } )) ); @@ -949,6 +1048,7 @@ mod tests { Some(RealtimeEvent::OutputTranscriptDelta( RealtimeTranscriptDelta { delta: "hi".to_string(), + ..Default::default() } )) ); @@ -983,6 +1083,8 @@ mod tests { fn parse_realtime_v2_input_audio_transcription_delta_event() { let payload = json!({ "type": "conversation.item.input_audio_transcription.delta", + "item_id": "item_input_1", + "content_index": 0, "delta": "hello" }) .to_string(); @@ -992,6 +1094,9 @@ mod tests { Some(RealtimeEvent::InputTranscriptDelta( RealtimeTranscriptDelta { delta: "hello".to_string(), + item_id: Some("item_input_1".to_string()), + output_index: None, + content_index: Some(0), } )) ); @@ -1001,6 +1106,9 @@ mod tests { fn parse_realtime_v2_output_text_done_event() { let payload = json!({ "type": "response.output_text.done", + "item_id": "item_output_1", + "output_index": 0, + "content_index": 1, "text": "all done" }) .to_string(); @@ -1010,6 +1118,9 @@ mod tests { Some(RealtimeEvent::OutputTranscriptDone( RealtimeTranscriptDone { text: "all done".to_string(), + item_id: Some("item_output_1".to_string()), + output_index: Some(0), + content_index: Some(1), } )) ); @@ -1018,11 +1129,35 @@ mod tests { #[test] fn complete_transcript_entry_replaces_current_part_only() { let mut state = ActiveTranscriptState::default(); + let first_part_delta = RealtimeTranscriptDelta { + delta: "hel".to_string(), + item_id: Some("item_output_1".to_string()), + output_index: Some(0), + content_index: Some(0), + }; + let second_part_delta = RealtimeTranscriptDelta { + delta: " wor".to_string(), + item_id: Some("item_output_1".to_string()), + output_index: Some(0), + content_index: Some(1), + }; + let first_part_done = RealtimeTranscriptDone { + text: "hello".to_string(), + item_id: Some("item_output_1".to_string()), + output_index: Some(0), + content_index: Some(0), + }; + let second_part_done = RealtimeTranscriptDone { + text: " world".to_string(), + item_id: Some("item_output_1".to_string()), + output_index: Some(0), + content_index: Some(1), + }; - append_transcript_delta(&mut state, "assistant", "hello"); - complete_transcript_entry(&mut state, "assistant", "hello"); - append_transcript_delta(&mut state, "assistant", " wor"); - complete_transcript_entry(&mut state, "assistant", " world"); + append_transcript_delta(&mut state, "assistant", &first_part_delta); + append_transcript_delta(&mut state, "assistant", &second_part_delta); + complete_transcript_entry(&mut state, "assistant", &second_part_done); + complete_transcript_entry(&mut state, "assistant", &first_part_done); assert_eq!( state, @@ -1571,6 +1706,7 @@ mod tests { input_delta_event, RealtimeEvent::InputTranscriptDelta(RealtimeTranscriptDelta { delta: "delegate ".to_string(), + ..Default::default() }) ); @@ -1583,6 +1719,7 @@ mod tests { input_delta_event, RealtimeEvent::InputTranscriptDelta(RealtimeTranscriptDelta { delta: "now".to_string(), + ..Default::default() }) ); @@ -1595,6 +1732,7 @@ mod tests { output_delta_event, RealtimeEvent::OutputTranscriptDelta(RealtimeTranscriptDelta { delta: "working".to_string(), + ..Default::default() }) ); diff --git a/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol_common.rs b/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol_common.rs index c89c5ea4d05..00513dc84e8 100644 --- a/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol_common.rs +++ b/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol_common.rs @@ -51,7 +51,12 @@ pub(super) fn parse_transcript_delta_event( .get(field) .and_then(Value::as_str) .map(str::to_string) - .map(|delta| RealtimeTranscriptDelta { delta }) + .map(|delta| RealtimeTranscriptDelta { + delta, + item_id: parse_string_field(parsed, "item_id"), + output_index: parse_u32_field(parsed, "output_index"), + content_index: parse_u32_field(parsed, "content_index"), + }) } pub(super) fn parse_transcript_done_event( @@ -62,7 +67,26 @@ pub(super) fn parse_transcript_done_event( .get(field) .and_then(Value::as_str) .map(str::to_string) - .map(|text| RealtimeTranscriptDone { text }) + .map(|text| RealtimeTranscriptDone { + text, + item_id: parse_string_field(parsed, "item_id"), + output_index: parse_u32_field(parsed, "output_index"), + content_index: parse_u32_field(parsed, "content_index"), + }) +} + +fn parse_string_field(parsed: &Value, field: &str) -> Option { + parsed + .get(field) + .and_then(Value::as_str) + .map(str::to_string) +} + +fn parse_u32_field(parsed: &Value, field: &str) -> Option { + parsed + .get(field) + .and_then(Value::as_u64) + .and_then(|value| u32::try_from(value).ok()) } pub(super) fn parse_error_event(parsed: &Value) -> Option { diff --git a/codex-rs/protocol/src/protocol.rs b/codex-rs/protocol/src/protocol.rs index 7431e5f1230..57e17a287bd 100644 --- a/codex-rs/protocol/src/protocol.rs +++ b/codex-rs/protocol/src/protocol.rs @@ -294,14 +294,40 @@ pub struct RealtimeAudioFrame { pub item_id: Option, } -#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq, JsonSchema, TS)] +#[derive(Debug, Clone, Default, Deserialize, Serialize, PartialEq, Eq, JsonSchema, TS)] pub struct RealtimeTranscriptDelta { pub delta: String, + /// Internal key material used to reconcile streamed transcript parts. + #[serde(skip)] + #[schemars(skip)] + #[ts(skip)] + pub item_id: Option, + #[serde(skip)] + #[schemars(skip)] + #[ts(skip)] + pub output_index: Option, + #[serde(skip)] + #[schemars(skip)] + #[ts(skip)] + pub content_index: Option, } -#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq, JsonSchema, TS)] +#[derive(Debug, Clone, Default, Deserialize, Serialize, PartialEq, Eq, JsonSchema, TS)] pub struct RealtimeTranscriptDone { pub text: String, + /// Internal key material used to reconcile streamed transcript parts. + #[serde(skip)] + #[schemars(skip)] + #[ts(skip)] + pub item_id: Option, + #[serde(skip)] + #[schemars(skip)] + #[ts(skip)] + pub output_index: Option, + #[serde(skip)] + #[schemars(skip)] + #[ts(skip)] + pub content_index: Option, } #[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq, JsonSchema, TS)] From 2eb4413eb3761c6342d473f02a8bce4fa9c1756c Mon Sep 17 00:00:00 2001 From: Ahmed Ibrahim Date: Mon, 13 Apr 2026 16:46:11 -0700 Subject: [PATCH 08/17] codex: fix CI failure on PR #17701 --- .../src/endpoint/realtime_websocket/methods.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/codex-rs/codex-api/src/endpoint/realtime_websocket/methods.rs b/codex-rs/codex-api/src/endpoint/realtime_websocket/methods.rs index 4fa6ba554cc..7173b457502 100644 --- a/codex-rs/codex-api/src/endpoint/realtime_websocket/methods.rs +++ b/codex-rs/codex-api/src/endpoint/realtime_websocket/methods.rs @@ -1130,25 +1130,25 @@ mod tests { fn complete_transcript_entry_replaces_current_part_only() { let mut state = ActiveTranscriptState::default(); let first_part_delta = RealtimeTranscriptDelta { - delta: "hel".to_string(), + delta: "hello".to_string(), item_id: Some("item_output_1".to_string()), output_index: Some(0), content_index: Some(0), }; let second_part_delta = RealtimeTranscriptDelta { - delta: " wor".to_string(), + delta: "beta".to_string(), item_id: Some("item_output_1".to_string()), output_index: Some(0), content_index: Some(1), }; let first_part_done = RealtimeTranscriptDone { - text: "hello".to_string(), + text: "hello!".to_string(), item_id: Some("item_output_1".to_string()), output_index: Some(0), content_index: Some(0), }; let second_part_done = RealtimeTranscriptDone { - text: " world".to_string(), + text: " beta".to_string(), item_id: Some("item_output_1".to_string()), output_index: Some(0), content_index: Some(1), @@ -1156,15 +1156,15 @@ mod tests { append_transcript_delta(&mut state, "assistant", &first_part_delta); append_transcript_delta(&mut state, "assistant", &second_part_delta); - complete_transcript_entry(&mut state, "assistant", &second_part_done); complete_transcript_entry(&mut state, "assistant", &first_part_done); + complete_transcript_entry(&mut state, "assistant", &second_part_done); assert_eq!( state, ActiveTranscriptState { entries: vec![RealtimeTranscriptEntry { role: "assistant".to_string(), - text: "hello world".to_string(), + text: "hello! beta".to_string(), }], in_progress_parts: Vec::new(), } From fa4f84f92ab2c1ac180ad47f32fcd80f5d7270dc Mon Sep 17 00:00:00 2001 From: Ahmed Ibrahim Date: Mon, 13 Apr 2026 16:53:02 -0700 Subject: [PATCH 09/17] codex: fix CI failure on PR #17701 --- .../endpoint/realtime_websocket/methods.rs | 67 ++++++++++++++++++- 1 file changed, 66 insertions(+), 1 deletion(-) diff --git a/codex-rs/codex-api/src/endpoint/realtime_websocket/methods.rs b/codex-rs/codex-api/src/endpoint/realtime_websocket/methods.rs index 7173b457502..b1f02508751 100644 --- a/codex-rs/codex-api/src/endpoint/realtime_websocket/methods.rs +++ b/codex-rs/codex-api/src/endpoint/realtime_websocket/methods.rs @@ -477,6 +477,7 @@ fn append_transcript_delta( update.output_index, update.content_index, ); + let key_has_metadata = transcript_part_key_has_metadata(&key); if let Some(part_index) = state .in_progress_parts @@ -488,6 +489,7 @@ fn append_transcript_delta( .entries .get(entry_index) .is_some_and(|entry| entry.role == role) + && (key_has_metadata || entry_index == state.entries.len().saturating_sub(1)) { let insert_at = state.in_progress_parts[part_index].end; state.entries[entry_index] @@ -529,6 +531,7 @@ fn complete_transcript_entry( update.output_index, update.content_index, ); + let key_has_metadata = transcript_part_key_has_metadata(&key); let part_index = state .in_progress_parts .iter() @@ -537,7 +540,10 @@ fn complete_transcript_entry( if let Some(part_index) = part_index { let part = state.in_progress_parts.swap_remove(part_index); let old_len = part.end.saturating_sub(part.start); - let replaced = if let Some(entry) = state.entries.get_mut(part.entry_index) + let entry_is_current = + key_has_metadata || part.entry_index == state.entries.len().saturating_sub(1); + let replaced = if entry_is_current + && let Some(entry) = state.entries.get_mut(part.entry_index) && entry.role == role && part.end <= entry.text.len() { @@ -589,6 +595,10 @@ fn transcript_part_key( } } +fn transcript_part_key_has_metadata(key: &TranscriptPartKey) -> bool { + key.item_id.is_some() || key.output_index.is_some() || key.content_index.is_some() +} + fn transcript_entry_index(entries: &mut Vec, role: &str) -> usize { match entries.last() { Some(last_entry) if last_entry.role == role => entries.len() - 1, @@ -1171,6 +1181,61 @@ mod tests { ); } + #[test] + fn unkeyed_transcript_deltas_start_new_entry_after_role_change() { + let mut state = ActiveTranscriptState::default(); + let assistant_context = RealtimeTranscriptDelta { + delta: "assistant context".to_string(), + ..Default::default() + }; + let delegated_query = RealtimeTranscriptDelta { + delta: "delegated query".to_string(), + ..Default::default() + }; + let assist_confirm = RealtimeTranscriptDelta { + delta: "assist confirm".to_string(), + ..Default::default() + }; + + append_transcript_delta(&mut state, "assistant", &assistant_context); + append_transcript_delta(&mut state, "user", &delegated_query); + append_transcript_delta(&mut state, "assistant", &assist_confirm); + + assert_eq!( + state, + ActiveTranscriptState { + entries: vec![ + RealtimeTranscriptEntry { + role: "assistant".to_string(), + text: "assistant context".to_string(), + }, + RealtimeTranscriptEntry { + role: "user".to_string(), + text: "delegated query".to_string(), + }, + RealtimeTranscriptEntry { + role: "assistant".to_string(), + text: "assist confirm".to_string(), + }, + ], + in_progress_parts: vec![ + ActiveTranscriptPart { + key: transcript_part_key("user", None, None, None), + entry_index: 1, + start: 0, + end: "delegated query".len(), + }, + ActiveTranscriptPart { + key: transcript_part_key("assistant", None, None, None), + entry_index: 2, + start: 0, + end: "assist confirm".len(), + }, + ], + } + ); + } + #[test] fn parse_realtime_v2_output_audio_delta_defaults_audio_shape() { let payload = json!({ From be8ac82be5748b4a355dcd2109c008f224c4eb90 Mon Sep 17 00:00:00 2001 From: Ahmed Ibrahim Date: Mon, 13 Apr 2026 16:59:15 -0700 Subject: [PATCH 10/17] codex: fix CI failure on PR #17701 --- .../src/endpoint/realtime_websocket/methods.rs | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/codex-rs/codex-api/src/endpoint/realtime_websocket/methods.rs b/codex-rs/codex-api/src/endpoint/realtime_websocket/methods.rs index b1f02508751..3b21a728202 100644 --- a/codex-rs/codex-api/src/endpoint/realtime_websocket/methods.rs +++ b/codex-rs/codex-api/src/endpoint/realtime_websocket/methods.rs @@ -1220,13 +1220,21 @@ mod tests { ], in_progress_parts: vec![ ActiveTranscriptPart { - key: transcript_part_key("user", None, None, None), + key: transcript_part_key( + "user", /*item_id*/ None, /*output_index*/ None, + /*content_index*/ None, + ), entry_index: 1, start: 0, end: "delegated query".len(), }, ActiveTranscriptPart { - key: transcript_part_key("assistant", None, None, None), + key: transcript_part_key( + "assistant", + /*item_id*/ None, + /*output_index*/ None, + /*content_index*/ None, + ), entry_index: 2, start: 0, end: "assist confirm".len(), From 24fff3c2820cac8a67b17812771fee64bea9804a Mon Sep 17 00:00:00 2001 From: Ahmed Ibrahim Date: Mon, 13 Apr 2026 22:09:10 -0700 Subject: [PATCH 11/17] codex: simplify realtime transcript done handling --- .../endpoint/realtime_websocket/methods.rs | 322 +----------------- .../realtime_websocket/protocol_common.rs | 28 +- codex-rs/protocol/src/protocol.rs | 30 +- 3 files changed, 15 insertions(+), 365 deletions(-) diff --git a/codex-rs/codex-api/src/endpoint/realtime_websocket/methods.rs b/codex-rs/codex-api/src/endpoint/realtime_websocket/methods.rs index 3b21a728202..aeaa374ef74 100644 --- a/codex-rs/codex-api/src/endpoint/realtime_websocket/methods.rs +++ b/codex-rs/codex-api/src/endpoint/realtime_websocket/methods.rs @@ -17,7 +17,6 @@ use crate::error::ApiError; use crate::provider::Provider; use codex_client::backoff; use codex_client::maybe_build_rustls_client_config_with_custom_ca; -use codex_protocol::protocol::RealtimeTranscriptDelta; use codex_protocol::protocol::RealtimeTranscriptDone; use codex_utils_rustls_provider::ensure_rustls_crypto_provider; use futures::SinkExt; @@ -208,33 +207,11 @@ pub struct RealtimeWebsocketWriter { #[derive(Clone)] pub struct RealtimeWebsocketEvents { rx_message: Arc>>>, - active_transcript: Arc>, + active_transcript: Arc>>, event_parser: RealtimeEventParser, is_closed: Arc, } -#[derive(Debug, Default, PartialEq, Eq)] -struct ActiveTranscriptState { - entries: Vec, - in_progress_parts: Vec, -} - -#[derive(Debug, PartialEq, Eq)] -struct ActiveTranscriptPart { - key: TranscriptPartKey, - entry_index: usize, - start: usize, - end: usize, -} - -#[derive(Debug, Clone, PartialEq, Eq)] -struct TranscriptPartKey { - role: String, - item_id: Option, - output_index: Option, - content_index: Option, -} - impl RealtimeWebsocketConnection { pub async fn send_audio_frame(&self, frame: RealtimeAudioFrame) -> Result<(), ApiError> { self.writer.send_audio_frame(frame).await @@ -285,7 +262,7 @@ impl RealtimeWebsocketConnection { }, events: RealtimeWebsocketEvents { rx_message: Arc::new(Mutex::new(rx_message)), - active_transcript: Arc::new(Mutex::new(ActiveTranscriptState::default())), + active_transcript: Arc::new(Mutex::new(Vec::new())), event_parser, is_closed, }, @@ -432,22 +409,17 @@ impl RealtimeWebsocketEvents { let mut active_transcript = self.active_transcript.lock().await; match event { RealtimeEvent::InputAudioSpeechStarted(_) => {} - RealtimeEvent::InputTranscriptDelta(update) => { - append_transcript_delta(&mut active_transcript, "user", update); - } + RealtimeEvent::InputTranscriptDelta(_) => {} RealtimeEvent::InputTranscriptDone(update) => { - complete_transcript_entry(&mut active_transcript, "user", update); - } - RealtimeEvent::OutputTranscriptDelta(update) => { - append_transcript_delta(&mut active_transcript, "assistant", update); + append_transcript_done(&mut active_transcript, "user", update); } + RealtimeEvent::OutputTranscriptDelta(_) => {} RealtimeEvent::OutputTranscriptDone(update) => { - complete_transcript_entry(&mut active_transcript, "assistant", update); + append_transcript_done(&mut active_transcript, "assistant", update); } RealtimeEvent::HandoffRequested(handoff) => { if self.event_parser == RealtimeEventParser::V1 { - handoff.active_transcript = std::mem::take(&mut active_transcript.entries); - active_transcript.in_progress_parts.clear(); + handoff.active_transcript = std::mem::take(&mut *active_transcript); } } RealtimeEvent::SessionUpdated { .. } @@ -462,182 +434,29 @@ impl RealtimeWebsocketEvents { } } -fn append_transcript_delta( - state: &mut ActiveTranscriptState, - role: &str, - update: &RealtimeTranscriptDelta, -) { - if update.delta.is_empty() { - return; - } - - let key = transcript_part_key( - role, - update.item_id.as_deref(), - update.output_index, - update.content_index, - ); - let key_has_metadata = transcript_part_key_has_metadata(&key); - - if let Some(part_index) = state - .in_progress_parts - .iter() - .position(|part| part.key == key) - { - let entry_index = state.in_progress_parts[part_index].entry_index; - if state - .entries - .get(entry_index) - .is_some_and(|entry| entry.role == role) - && (key_has_metadata || entry_index == state.entries.len().saturating_sub(1)) - { - let insert_at = state.in_progress_parts[part_index].end; - state.entries[entry_index] - .text - .insert_str(insert_at, &update.delta); - adjust_transcript_part_offsets( - state, - entry_index, - insert_at, - isize::try_from(update.delta.len()).unwrap_or(isize::MAX), - ); - state.in_progress_parts[part_index].end += update.delta.len(); - return; - } - - state.in_progress_parts.swap_remove(part_index); - } - - let entry_index = transcript_entry_index(&mut state.entries, role); - let start = state.entries[entry_index].text.len(); - state.entries[entry_index].text.push_str(&update.delta); - - state.in_progress_parts.push(ActiveTranscriptPart { - key, - entry_index, - start, - end: start + update.delta.len(), - }); -} - -fn complete_transcript_entry( - state: &mut ActiveTranscriptState, +fn append_transcript_done( + entries: &mut Vec, role: &str, update: &RealtimeTranscriptDone, ) { - let key = transcript_part_key( - role, - update.item_id.as_deref(), - update.output_index, - update.content_index, - ); - let key_has_metadata = transcript_part_key_has_metadata(&key); - let part_index = state - .in_progress_parts - .iter() - .position(|part| part.key == key); - - if let Some(part_index) = part_index { - let part = state.in_progress_parts.swap_remove(part_index); - let old_len = part.end.saturating_sub(part.start); - let entry_is_current = - key_has_metadata || part.entry_index == state.entries.len().saturating_sub(1); - let replaced = if entry_is_current - && let Some(entry) = state.entries.get_mut(part.entry_index) - && entry.role == role - && part.end <= entry.text.len() - { - // Done events carry the complete current part, not the complete - // message, so only replace the range accumulated for that part. - entry.text.replace_range(part.start..part.end, &update.text); - true - } else { - false - }; - - if replaced { - let length_delta = isize::try_from(update.text.len()).unwrap_or(isize::MAX) - - isize::try_from(old_len).unwrap_or(isize::MAX); - adjust_transcript_part_offsets(state, part.entry_index, part.end, length_delta); - return; - } - } - let text = &update.text; if text.is_empty() { return; } - if let Some(last_entry) = state.entries.last_mut() + if let Some(last_entry) = entries.last_mut() && last_entry.role == role { last_entry.text.push_str(text); return; } - state.entries.push(RealtimeTranscriptEntry { + entries.push(RealtimeTranscriptEntry { role: role.to_string(), text: text.to_string(), }); } -fn transcript_part_key( - role: &str, - item_id: Option<&str>, - output_index: Option, - content_index: Option, -) -> TranscriptPartKey { - TranscriptPartKey { - role: role.to_string(), - item_id: item_id.map(str::to_string), - output_index, - content_index, - } -} - -fn transcript_part_key_has_metadata(key: &TranscriptPartKey) -> bool { - key.item_id.is_some() || key.output_index.is_some() || key.content_index.is_some() -} - -fn transcript_entry_index(entries: &mut Vec, role: &str) -> usize { - match entries.last() { - Some(last_entry) if last_entry.role == role => entries.len() - 1, - _ => { - entries.push(RealtimeTranscriptEntry { - role: role.to_string(), - text: String::new(), - }); - entries.len() - 1 - } - } -} - -fn adjust_transcript_part_offsets( - state: &mut ActiveTranscriptState, - entry_index: usize, - after: usize, - amount: isize, -) { - if amount == 0 { - return; - } - - for part in &mut state.in_progress_parts { - if part.entry_index == entry_index && part.start >= after { - adjust_offset(&mut part.start, amount); - adjust_offset(&mut part.end, amount); - } - } -} - -fn adjust_offset(offset: &mut usize, amount: isize) { - if amount.is_positive() { - *offset += amount.unsigned_abs(); - } else { - *offset -= amount.unsigned_abs(); - } -} - pub struct RealtimeWebsocketClient { provider: Provider, } @@ -1039,7 +858,6 @@ mod tests { Some(RealtimeEvent::InputTranscriptDelta( RealtimeTranscriptDelta { delta: "hello ".to_string(), - ..Default::default() } )) ); @@ -1058,7 +876,6 @@ mod tests { Some(RealtimeEvent::OutputTranscriptDelta( RealtimeTranscriptDelta { delta: "hi".to_string(), - ..Default::default() } )) ); @@ -1104,9 +921,6 @@ mod tests { Some(RealtimeEvent::InputTranscriptDelta( RealtimeTranscriptDelta { delta: "hello".to_string(), - item_id: Some("item_input_1".to_string()), - output_index: None, - content_index: Some(0), } )) ); @@ -1128,122 +942,11 @@ mod tests { Some(RealtimeEvent::OutputTranscriptDone( RealtimeTranscriptDone { text: "all done".to_string(), - item_id: Some("item_output_1".to_string()), - output_index: Some(0), - content_index: Some(1), } )) ); } - #[test] - fn complete_transcript_entry_replaces_current_part_only() { - let mut state = ActiveTranscriptState::default(); - let first_part_delta = RealtimeTranscriptDelta { - delta: "hello".to_string(), - item_id: Some("item_output_1".to_string()), - output_index: Some(0), - content_index: Some(0), - }; - let second_part_delta = RealtimeTranscriptDelta { - delta: "beta".to_string(), - item_id: Some("item_output_1".to_string()), - output_index: Some(0), - content_index: Some(1), - }; - let first_part_done = RealtimeTranscriptDone { - text: "hello!".to_string(), - item_id: Some("item_output_1".to_string()), - output_index: Some(0), - content_index: Some(0), - }; - let second_part_done = RealtimeTranscriptDone { - text: " beta".to_string(), - item_id: Some("item_output_1".to_string()), - output_index: Some(0), - content_index: Some(1), - }; - - append_transcript_delta(&mut state, "assistant", &first_part_delta); - append_transcript_delta(&mut state, "assistant", &second_part_delta); - complete_transcript_entry(&mut state, "assistant", &first_part_done); - complete_transcript_entry(&mut state, "assistant", &second_part_done); - - assert_eq!( - state, - ActiveTranscriptState { - entries: vec![RealtimeTranscriptEntry { - role: "assistant".to_string(), - text: "hello! beta".to_string(), - }], - in_progress_parts: Vec::new(), - } - ); - } - - #[test] - fn unkeyed_transcript_deltas_start_new_entry_after_role_change() { - let mut state = ActiveTranscriptState::default(); - let assistant_context = RealtimeTranscriptDelta { - delta: "assistant context".to_string(), - ..Default::default() - }; - let delegated_query = RealtimeTranscriptDelta { - delta: "delegated query".to_string(), - ..Default::default() - }; - let assist_confirm = RealtimeTranscriptDelta { - delta: "assist confirm".to_string(), - ..Default::default() - }; - - append_transcript_delta(&mut state, "assistant", &assistant_context); - append_transcript_delta(&mut state, "user", &delegated_query); - append_transcript_delta(&mut state, "assistant", &assist_confirm); - - assert_eq!( - state, - ActiveTranscriptState { - entries: vec![ - RealtimeTranscriptEntry { - role: "assistant".to_string(), - text: "assistant context".to_string(), - }, - RealtimeTranscriptEntry { - role: "user".to_string(), - text: "delegated query".to_string(), - }, - RealtimeTranscriptEntry { - role: "assistant".to_string(), - text: "assist confirm".to_string(), - }, - ], - in_progress_parts: vec![ - ActiveTranscriptPart { - key: transcript_part_key( - "user", /*item_id*/ None, /*output_index*/ None, - /*content_index*/ None, - ), - entry_index: 1, - start: 0, - end: "delegated query".len(), - }, - ActiveTranscriptPart { - key: transcript_part_key( - "assistant", - /*item_id*/ None, - /*output_index*/ None, - /*content_index*/ None, - ), - entry_index: 2, - start: 0, - end: "assist confirm".len(), - }, - ], - } - ); - } - #[test] fn parse_realtime_v2_output_audio_delta_defaults_audio_shape() { let payload = json!({ @@ -1779,7 +1482,6 @@ mod tests { input_delta_event, RealtimeEvent::InputTranscriptDelta(RealtimeTranscriptDelta { delta: "delegate ".to_string(), - ..Default::default() }) ); @@ -1792,7 +1494,6 @@ mod tests { input_delta_event, RealtimeEvent::InputTranscriptDelta(RealtimeTranscriptDelta { delta: "now".to_string(), - ..Default::default() }) ); @@ -1805,7 +1506,6 @@ mod tests { output_delta_event, RealtimeEvent::OutputTranscriptDelta(RealtimeTranscriptDelta { delta: "working".to_string(), - ..Default::default() }) ); diff --git a/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol_common.rs b/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol_common.rs index 00513dc84e8..c89c5ea4d05 100644 --- a/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol_common.rs +++ b/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol_common.rs @@ -51,12 +51,7 @@ pub(super) fn parse_transcript_delta_event( .get(field) .and_then(Value::as_str) .map(str::to_string) - .map(|delta| RealtimeTranscriptDelta { - delta, - item_id: parse_string_field(parsed, "item_id"), - output_index: parse_u32_field(parsed, "output_index"), - content_index: parse_u32_field(parsed, "content_index"), - }) + .map(|delta| RealtimeTranscriptDelta { delta }) } pub(super) fn parse_transcript_done_event( @@ -67,26 +62,7 @@ pub(super) fn parse_transcript_done_event( .get(field) .and_then(Value::as_str) .map(str::to_string) - .map(|text| RealtimeTranscriptDone { - text, - item_id: parse_string_field(parsed, "item_id"), - output_index: parse_u32_field(parsed, "output_index"), - content_index: parse_u32_field(parsed, "content_index"), - }) -} - -fn parse_string_field(parsed: &Value, field: &str) -> Option { - parsed - .get(field) - .and_then(Value::as_str) - .map(str::to_string) -} - -fn parse_u32_field(parsed: &Value, field: &str) -> Option { - parsed - .get(field) - .and_then(Value::as_u64) - .and_then(|value| u32::try_from(value).ok()) + .map(|text| RealtimeTranscriptDone { text }) } pub(super) fn parse_error_event(parsed: &Value) -> Option { diff --git a/codex-rs/protocol/src/protocol.rs b/codex-rs/protocol/src/protocol.rs index 57e17a287bd..7431e5f1230 100644 --- a/codex-rs/protocol/src/protocol.rs +++ b/codex-rs/protocol/src/protocol.rs @@ -294,40 +294,14 @@ pub struct RealtimeAudioFrame { pub item_id: Option, } -#[derive(Debug, Clone, Default, Deserialize, Serialize, PartialEq, Eq, JsonSchema, TS)] +#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq, JsonSchema, TS)] pub struct RealtimeTranscriptDelta { pub delta: String, - /// Internal key material used to reconcile streamed transcript parts. - #[serde(skip)] - #[schemars(skip)] - #[ts(skip)] - pub item_id: Option, - #[serde(skip)] - #[schemars(skip)] - #[ts(skip)] - pub output_index: Option, - #[serde(skip)] - #[schemars(skip)] - #[ts(skip)] - pub content_index: Option, } -#[derive(Debug, Clone, Default, Deserialize, Serialize, PartialEq, Eq, JsonSchema, TS)] +#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq, JsonSchema, TS)] pub struct RealtimeTranscriptDone { pub text: String, - /// Internal key material used to reconcile streamed transcript parts. - #[serde(skip)] - #[schemars(skip)] - #[ts(skip)] - pub item_id: Option, - #[serde(skip)] - #[schemars(skip)] - #[ts(skip)] - pub output_index: Option, - #[serde(skip)] - #[schemars(skip)] - #[ts(skip)] - pub content_index: Option, } #[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq, JsonSchema, TS)] From b1b26c4b7cde4b16cf80eec95acbe40cbea27eb1 Mon Sep 17 00:00:00 2001 From: Ahmed Ibrahim Date: Mon, 13 Apr 2026 22:12:08 -0700 Subject: [PATCH 12/17] Revert "codex: simplify realtime transcript done handling" This reverts commit 24fff3c2820cac8a67b17812771fee64bea9804a. --- .../endpoint/realtime_websocket/methods.rs | 322 +++++++++++++++++- .../realtime_websocket/protocol_common.rs | 28 +- codex-rs/protocol/src/protocol.rs | 30 +- 3 files changed, 365 insertions(+), 15 deletions(-) diff --git a/codex-rs/codex-api/src/endpoint/realtime_websocket/methods.rs b/codex-rs/codex-api/src/endpoint/realtime_websocket/methods.rs index aeaa374ef74..3b21a728202 100644 --- a/codex-rs/codex-api/src/endpoint/realtime_websocket/methods.rs +++ b/codex-rs/codex-api/src/endpoint/realtime_websocket/methods.rs @@ -17,6 +17,7 @@ use crate::error::ApiError; use crate::provider::Provider; use codex_client::backoff; use codex_client::maybe_build_rustls_client_config_with_custom_ca; +use codex_protocol::protocol::RealtimeTranscriptDelta; use codex_protocol::protocol::RealtimeTranscriptDone; use codex_utils_rustls_provider::ensure_rustls_crypto_provider; use futures::SinkExt; @@ -207,11 +208,33 @@ pub struct RealtimeWebsocketWriter { #[derive(Clone)] pub struct RealtimeWebsocketEvents { rx_message: Arc>>>, - active_transcript: Arc>>, + active_transcript: Arc>, event_parser: RealtimeEventParser, is_closed: Arc, } +#[derive(Debug, Default, PartialEq, Eq)] +struct ActiveTranscriptState { + entries: Vec, + in_progress_parts: Vec, +} + +#[derive(Debug, PartialEq, Eq)] +struct ActiveTranscriptPart { + key: TranscriptPartKey, + entry_index: usize, + start: usize, + end: usize, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +struct TranscriptPartKey { + role: String, + item_id: Option, + output_index: Option, + content_index: Option, +} + impl RealtimeWebsocketConnection { pub async fn send_audio_frame(&self, frame: RealtimeAudioFrame) -> Result<(), ApiError> { self.writer.send_audio_frame(frame).await @@ -262,7 +285,7 @@ impl RealtimeWebsocketConnection { }, events: RealtimeWebsocketEvents { rx_message: Arc::new(Mutex::new(rx_message)), - active_transcript: Arc::new(Mutex::new(Vec::new())), + active_transcript: Arc::new(Mutex::new(ActiveTranscriptState::default())), event_parser, is_closed, }, @@ -409,17 +432,22 @@ impl RealtimeWebsocketEvents { let mut active_transcript = self.active_transcript.lock().await; match event { RealtimeEvent::InputAudioSpeechStarted(_) => {} - RealtimeEvent::InputTranscriptDelta(_) => {} + RealtimeEvent::InputTranscriptDelta(update) => { + append_transcript_delta(&mut active_transcript, "user", update); + } RealtimeEvent::InputTranscriptDone(update) => { - append_transcript_done(&mut active_transcript, "user", update); + complete_transcript_entry(&mut active_transcript, "user", update); + } + RealtimeEvent::OutputTranscriptDelta(update) => { + append_transcript_delta(&mut active_transcript, "assistant", update); } - RealtimeEvent::OutputTranscriptDelta(_) => {} RealtimeEvent::OutputTranscriptDone(update) => { - append_transcript_done(&mut active_transcript, "assistant", update); + complete_transcript_entry(&mut active_transcript, "assistant", update); } RealtimeEvent::HandoffRequested(handoff) => { if self.event_parser == RealtimeEventParser::V1 { - handoff.active_transcript = std::mem::take(&mut *active_transcript); + handoff.active_transcript = std::mem::take(&mut active_transcript.entries); + active_transcript.in_progress_parts.clear(); } } RealtimeEvent::SessionUpdated { .. } @@ -434,29 +462,182 @@ impl RealtimeWebsocketEvents { } } -fn append_transcript_done( - entries: &mut Vec, +fn append_transcript_delta( + state: &mut ActiveTranscriptState, + role: &str, + update: &RealtimeTranscriptDelta, +) { + if update.delta.is_empty() { + return; + } + + let key = transcript_part_key( + role, + update.item_id.as_deref(), + update.output_index, + update.content_index, + ); + let key_has_metadata = transcript_part_key_has_metadata(&key); + + if let Some(part_index) = state + .in_progress_parts + .iter() + .position(|part| part.key == key) + { + let entry_index = state.in_progress_parts[part_index].entry_index; + if state + .entries + .get(entry_index) + .is_some_and(|entry| entry.role == role) + && (key_has_metadata || entry_index == state.entries.len().saturating_sub(1)) + { + let insert_at = state.in_progress_parts[part_index].end; + state.entries[entry_index] + .text + .insert_str(insert_at, &update.delta); + adjust_transcript_part_offsets( + state, + entry_index, + insert_at, + isize::try_from(update.delta.len()).unwrap_or(isize::MAX), + ); + state.in_progress_parts[part_index].end += update.delta.len(); + return; + } + + state.in_progress_parts.swap_remove(part_index); + } + + let entry_index = transcript_entry_index(&mut state.entries, role); + let start = state.entries[entry_index].text.len(); + state.entries[entry_index].text.push_str(&update.delta); + + state.in_progress_parts.push(ActiveTranscriptPart { + key, + entry_index, + start, + end: start + update.delta.len(), + }); +} + +fn complete_transcript_entry( + state: &mut ActiveTranscriptState, role: &str, update: &RealtimeTranscriptDone, ) { + let key = transcript_part_key( + role, + update.item_id.as_deref(), + update.output_index, + update.content_index, + ); + let key_has_metadata = transcript_part_key_has_metadata(&key); + let part_index = state + .in_progress_parts + .iter() + .position(|part| part.key == key); + + if let Some(part_index) = part_index { + let part = state.in_progress_parts.swap_remove(part_index); + let old_len = part.end.saturating_sub(part.start); + let entry_is_current = + key_has_metadata || part.entry_index == state.entries.len().saturating_sub(1); + let replaced = if entry_is_current + && let Some(entry) = state.entries.get_mut(part.entry_index) + && entry.role == role + && part.end <= entry.text.len() + { + // Done events carry the complete current part, not the complete + // message, so only replace the range accumulated for that part. + entry.text.replace_range(part.start..part.end, &update.text); + true + } else { + false + }; + + if replaced { + let length_delta = isize::try_from(update.text.len()).unwrap_or(isize::MAX) + - isize::try_from(old_len).unwrap_or(isize::MAX); + adjust_transcript_part_offsets(state, part.entry_index, part.end, length_delta); + return; + } + } + let text = &update.text; if text.is_empty() { return; } - if let Some(last_entry) = entries.last_mut() + if let Some(last_entry) = state.entries.last_mut() && last_entry.role == role { last_entry.text.push_str(text); return; } - entries.push(RealtimeTranscriptEntry { + state.entries.push(RealtimeTranscriptEntry { role: role.to_string(), text: text.to_string(), }); } +fn transcript_part_key( + role: &str, + item_id: Option<&str>, + output_index: Option, + content_index: Option, +) -> TranscriptPartKey { + TranscriptPartKey { + role: role.to_string(), + item_id: item_id.map(str::to_string), + output_index, + content_index, + } +} + +fn transcript_part_key_has_metadata(key: &TranscriptPartKey) -> bool { + key.item_id.is_some() || key.output_index.is_some() || key.content_index.is_some() +} + +fn transcript_entry_index(entries: &mut Vec, role: &str) -> usize { + match entries.last() { + Some(last_entry) if last_entry.role == role => entries.len() - 1, + _ => { + entries.push(RealtimeTranscriptEntry { + role: role.to_string(), + text: String::new(), + }); + entries.len() - 1 + } + } +} + +fn adjust_transcript_part_offsets( + state: &mut ActiveTranscriptState, + entry_index: usize, + after: usize, + amount: isize, +) { + if amount == 0 { + return; + } + + for part in &mut state.in_progress_parts { + if part.entry_index == entry_index && part.start >= after { + adjust_offset(&mut part.start, amount); + adjust_offset(&mut part.end, amount); + } + } +} + +fn adjust_offset(offset: &mut usize, amount: isize) { + if amount.is_positive() { + *offset += amount.unsigned_abs(); + } else { + *offset -= amount.unsigned_abs(); + } +} + pub struct RealtimeWebsocketClient { provider: Provider, } @@ -858,6 +1039,7 @@ mod tests { Some(RealtimeEvent::InputTranscriptDelta( RealtimeTranscriptDelta { delta: "hello ".to_string(), + ..Default::default() } )) ); @@ -876,6 +1058,7 @@ mod tests { Some(RealtimeEvent::OutputTranscriptDelta( RealtimeTranscriptDelta { delta: "hi".to_string(), + ..Default::default() } )) ); @@ -921,6 +1104,9 @@ mod tests { Some(RealtimeEvent::InputTranscriptDelta( RealtimeTranscriptDelta { delta: "hello".to_string(), + item_id: Some("item_input_1".to_string()), + output_index: None, + content_index: Some(0), } )) ); @@ -942,11 +1128,122 @@ mod tests { Some(RealtimeEvent::OutputTranscriptDone( RealtimeTranscriptDone { text: "all done".to_string(), + item_id: Some("item_output_1".to_string()), + output_index: Some(0), + content_index: Some(1), } )) ); } + #[test] + fn complete_transcript_entry_replaces_current_part_only() { + let mut state = ActiveTranscriptState::default(); + let first_part_delta = RealtimeTranscriptDelta { + delta: "hello".to_string(), + item_id: Some("item_output_1".to_string()), + output_index: Some(0), + content_index: Some(0), + }; + let second_part_delta = RealtimeTranscriptDelta { + delta: "beta".to_string(), + item_id: Some("item_output_1".to_string()), + output_index: Some(0), + content_index: Some(1), + }; + let first_part_done = RealtimeTranscriptDone { + text: "hello!".to_string(), + item_id: Some("item_output_1".to_string()), + output_index: Some(0), + content_index: Some(0), + }; + let second_part_done = RealtimeTranscriptDone { + text: " beta".to_string(), + item_id: Some("item_output_1".to_string()), + output_index: Some(0), + content_index: Some(1), + }; + + append_transcript_delta(&mut state, "assistant", &first_part_delta); + append_transcript_delta(&mut state, "assistant", &second_part_delta); + complete_transcript_entry(&mut state, "assistant", &first_part_done); + complete_transcript_entry(&mut state, "assistant", &second_part_done); + + assert_eq!( + state, + ActiveTranscriptState { + entries: vec![RealtimeTranscriptEntry { + role: "assistant".to_string(), + text: "hello! beta".to_string(), + }], + in_progress_parts: Vec::new(), + } + ); + } + + #[test] + fn unkeyed_transcript_deltas_start_new_entry_after_role_change() { + let mut state = ActiveTranscriptState::default(); + let assistant_context = RealtimeTranscriptDelta { + delta: "assistant context".to_string(), + ..Default::default() + }; + let delegated_query = RealtimeTranscriptDelta { + delta: "delegated query".to_string(), + ..Default::default() + }; + let assist_confirm = RealtimeTranscriptDelta { + delta: "assist confirm".to_string(), + ..Default::default() + }; + + append_transcript_delta(&mut state, "assistant", &assistant_context); + append_transcript_delta(&mut state, "user", &delegated_query); + append_transcript_delta(&mut state, "assistant", &assist_confirm); + + assert_eq!( + state, + ActiveTranscriptState { + entries: vec![ + RealtimeTranscriptEntry { + role: "assistant".to_string(), + text: "assistant context".to_string(), + }, + RealtimeTranscriptEntry { + role: "user".to_string(), + text: "delegated query".to_string(), + }, + RealtimeTranscriptEntry { + role: "assistant".to_string(), + text: "assist confirm".to_string(), + }, + ], + in_progress_parts: vec![ + ActiveTranscriptPart { + key: transcript_part_key( + "user", /*item_id*/ None, /*output_index*/ None, + /*content_index*/ None, + ), + entry_index: 1, + start: 0, + end: "delegated query".len(), + }, + ActiveTranscriptPart { + key: transcript_part_key( + "assistant", + /*item_id*/ None, + /*output_index*/ None, + /*content_index*/ None, + ), + entry_index: 2, + start: 0, + end: "assist confirm".len(), + }, + ], + } + ); + } + #[test] fn parse_realtime_v2_output_audio_delta_defaults_audio_shape() { let payload = json!({ @@ -1482,6 +1779,7 @@ mod tests { input_delta_event, RealtimeEvent::InputTranscriptDelta(RealtimeTranscriptDelta { delta: "delegate ".to_string(), + ..Default::default() }) ); @@ -1494,6 +1792,7 @@ mod tests { input_delta_event, RealtimeEvent::InputTranscriptDelta(RealtimeTranscriptDelta { delta: "now".to_string(), + ..Default::default() }) ); @@ -1506,6 +1805,7 @@ mod tests { output_delta_event, RealtimeEvent::OutputTranscriptDelta(RealtimeTranscriptDelta { delta: "working".to_string(), + ..Default::default() }) ); diff --git a/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol_common.rs b/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol_common.rs index c89c5ea4d05..00513dc84e8 100644 --- a/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol_common.rs +++ b/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol_common.rs @@ -51,7 +51,12 @@ pub(super) fn parse_transcript_delta_event( .get(field) .and_then(Value::as_str) .map(str::to_string) - .map(|delta| RealtimeTranscriptDelta { delta }) + .map(|delta| RealtimeTranscriptDelta { + delta, + item_id: parse_string_field(parsed, "item_id"), + output_index: parse_u32_field(parsed, "output_index"), + content_index: parse_u32_field(parsed, "content_index"), + }) } pub(super) fn parse_transcript_done_event( @@ -62,7 +67,26 @@ pub(super) fn parse_transcript_done_event( .get(field) .and_then(Value::as_str) .map(str::to_string) - .map(|text| RealtimeTranscriptDone { text }) + .map(|text| RealtimeTranscriptDone { + text, + item_id: parse_string_field(parsed, "item_id"), + output_index: parse_u32_field(parsed, "output_index"), + content_index: parse_u32_field(parsed, "content_index"), + }) +} + +fn parse_string_field(parsed: &Value, field: &str) -> Option { + parsed + .get(field) + .and_then(Value::as_str) + .map(str::to_string) +} + +fn parse_u32_field(parsed: &Value, field: &str) -> Option { + parsed + .get(field) + .and_then(Value::as_u64) + .and_then(|value| u32::try_from(value).ok()) } pub(super) fn parse_error_event(parsed: &Value) -> Option { diff --git a/codex-rs/protocol/src/protocol.rs b/codex-rs/protocol/src/protocol.rs index 7431e5f1230..57e17a287bd 100644 --- a/codex-rs/protocol/src/protocol.rs +++ b/codex-rs/protocol/src/protocol.rs @@ -294,14 +294,40 @@ pub struct RealtimeAudioFrame { pub item_id: Option, } -#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq, JsonSchema, TS)] +#[derive(Debug, Clone, Default, Deserialize, Serialize, PartialEq, Eq, JsonSchema, TS)] pub struct RealtimeTranscriptDelta { pub delta: String, + /// Internal key material used to reconcile streamed transcript parts. + #[serde(skip)] + #[schemars(skip)] + #[ts(skip)] + pub item_id: Option, + #[serde(skip)] + #[schemars(skip)] + #[ts(skip)] + pub output_index: Option, + #[serde(skip)] + #[schemars(skip)] + #[ts(skip)] + pub content_index: Option, } -#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq, JsonSchema, TS)] +#[derive(Debug, Clone, Default, Deserialize, Serialize, PartialEq, Eq, JsonSchema, TS)] pub struct RealtimeTranscriptDone { pub text: String, + /// Internal key material used to reconcile streamed transcript parts. + #[serde(skip)] + #[schemars(skip)] + #[ts(skip)] + pub item_id: Option, + #[serde(skip)] + #[schemars(skip)] + #[ts(skip)] + pub output_index: Option, + #[serde(skip)] + #[schemars(skip)] + #[ts(skip)] + pub content_index: Option, } #[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq, JsonSchema, TS)] From b97dea890fd2fd4aa6798abf54af5d60a43a8073 Mon Sep 17 00:00:00 2001 From: Ahmed Ibrahim Date: Mon, 13 Apr 2026 22:13:58 -0700 Subject: [PATCH 13/17] codex: use item done for realtime transcript done --- .../tests/suite/v2/realtime_conversation.rs | 9 ++++- .../endpoint/realtime_websocket/methods.rs | 29 +++++++++++++++ .../realtime_websocket/protocol_v2.rs | 37 +++++++++++++++++++ 3 files changed, 73 insertions(+), 2 deletions(-) diff --git a/codex-rs/app-server/tests/suite/v2/realtime_conversation.rs b/codex-rs/app-server/tests/suite/v2/realtime_conversation.rs index e3e3c8ac1a2..a08b0412869 100644 --- a/codex-rs/app-server/tests/suite/v2/realtime_conversation.rs +++ b/codex-rs/app-server/tests/suite/v2/realtime_conversation.rs @@ -732,8 +732,13 @@ async fn realtime_text_output_modality_requests_text_output_and_final_transcript "delta": "world" }), json!({ - "type": "response.output_text.done", - "text": "hello world" + "type": "conversation.item.done", + "item": { + "id": "item_output_1", + "type": "message", + "role": "assistant", + "content": [{"type": "output_text", "text": "hello world"}] + } }), ]]]) .await; diff --git a/codex-rs/codex-api/src/endpoint/realtime_websocket/methods.rs b/codex-rs/codex-api/src/endpoint/realtime_websocket/methods.rs index 3b21a728202..4f2ec9c54c0 100644 --- a/codex-rs/codex-api/src/endpoint/realtime_websocket/methods.rs +++ b/codex-rs/codex-api/src/endpoint/realtime_websocket/methods.rs @@ -1136,6 +1136,35 @@ mod tests { ); } + #[test] + fn parse_realtime_v2_item_done_output_text_event() { + let payload = json!({ + "type": "conversation.item.done", + "item": { + "id": "item_output_1", + "type": "message", + "role": "assistant", + "content": [ + {"type": "output_text", "text": "hello"}, + {"type": "output_text", "text": " world"} + ] + } + }) + .to_string(); + + assert_eq!( + parse_realtime_event(payload.as_str(), RealtimeEventParser::RealtimeV2), + Some(RealtimeEvent::OutputTranscriptDone( + RealtimeTranscriptDone { + text: "hello world".to_string(), + item_id: Some("item_output_1".to_string()), + output_index: None, + content_index: None, + } + )) + ); + } + #[test] fn complete_transcript_entry_replaces_current_part_only() { let mut state = ActiveTranscriptState::default(); diff --git a/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol_v2.rs b/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol_v2.rs index e09a60cb210..4cea1535d67 100644 --- a/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol_v2.rs +++ b/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol_v2.rs @@ -10,6 +10,7 @@ use codex_protocol::protocol::RealtimeInputAudioSpeechStarted; use codex_protocol::protocol::RealtimeResponseCancelled; use codex_protocol::protocol::RealtimeResponseCreated; use codex_protocol::protocol::RealtimeResponseDone; +use codex_protocol::protocol::RealtimeTranscriptDone; use serde_json::Map as JsonMap; use serde_json::Value; use tracing::debug; @@ -128,12 +129,48 @@ fn parse_conversation_item_done_event(parsed: &Value) -> Option { return Some(handoff); } + if let Some(transcript_done) = parse_item_done_transcript(item) { + return Some(transcript_done); + } + item.get("id") .and_then(Value::as_str) .map(str::to_string) .map(|item_id| RealtimeEvent::ConversationItemDone { item_id }) } +fn parse_item_done_transcript(item: &JsonMap) -> Option { + let role = item.get("role").and_then(Value::as_str)?; + let text = item + .get("content") + .and_then(Value::as_array)? + .iter() + .filter_map(item_content_text) + .collect::(); + if text.is_empty() { + return None; + } + + let done = RealtimeTranscriptDone { + text, + item_id: item.get("id").and_then(Value::as_str).map(str::to_string), + output_index: None, + content_index: None, + }; + match role { + "user" => Some(RealtimeEvent::InputTranscriptDone(done)), + "assistant" => Some(RealtimeEvent::OutputTranscriptDone(done)), + _ => None, + } +} + +fn item_content_text(content: &Value) -> Option<&str> { + content + .get("text") + .or_else(|| content.get("transcript")) + .and_then(Value::as_str) +} + fn parse_handoff_requested_event(item: &JsonMap) -> Option { let item_type = item.get("type").and_then(Value::as_str); let item_name = item.get("name").and_then(Value::as_str); From a4a812d27e3eb651720b43cdcde07ec7331f0a0d Mon Sep 17 00:00:00 2001 From: Ahmed Ibrahim Date: Mon, 13 Apr 2026 22:24:16 -0700 Subject: [PATCH 14/17] codex: remove realtime transcript correlation state --- .../endpoint/realtime_websocket/methods.rs | 329 +----------------- .../realtime_websocket/protocol_common.rs | 28 +- .../realtime_websocket/protocol_v2.rs | 7 +- codex-rs/protocol/src/protocol.rs | 30 +- 4 files changed, 18 insertions(+), 376 deletions(-) diff --git a/codex-rs/codex-api/src/endpoint/realtime_websocket/methods.rs b/codex-rs/codex-api/src/endpoint/realtime_websocket/methods.rs index 4f2ec9c54c0..14fc775b6a0 100644 --- a/codex-rs/codex-api/src/endpoint/realtime_websocket/methods.rs +++ b/codex-rs/codex-api/src/endpoint/realtime_websocket/methods.rs @@ -18,7 +18,6 @@ use crate::provider::Provider; use codex_client::backoff; use codex_client::maybe_build_rustls_client_config_with_custom_ca; use codex_protocol::protocol::RealtimeTranscriptDelta; -use codex_protocol::protocol::RealtimeTranscriptDone; use codex_utils_rustls_provider::ensure_rustls_crypto_provider; use futures::SinkExt; use futures::StreamExt; @@ -213,26 +212,9 @@ pub struct RealtimeWebsocketEvents { is_closed: Arc, } -#[derive(Debug, Default, PartialEq, Eq)] +#[derive(Default)] struct ActiveTranscriptState { entries: Vec, - in_progress_parts: Vec, -} - -#[derive(Debug, PartialEq, Eq)] -struct ActiveTranscriptPart { - key: TranscriptPartKey, - entry_index: usize, - start: usize, - end: usize, -} - -#[derive(Debug, Clone, PartialEq, Eq)] -struct TranscriptPartKey { - role: String, - item_id: Option, - output_index: Option, - content_index: Option, } impl RealtimeWebsocketConnection { @@ -432,25 +414,20 @@ impl RealtimeWebsocketEvents { let mut active_transcript = self.active_transcript.lock().await; match event { RealtimeEvent::InputAudioSpeechStarted(_) => {} - RealtimeEvent::InputTranscriptDelta(update) => { - append_transcript_delta(&mut active_transcript, "user", update); - } - RealtimeEvent::InputTranscriptDone(update) => { - complete_transcript_entry(&mut active_transcript, "user", update); - } - RealtimeEvent::OutputTranscriptDelta(update) => { - append_transcript_delta(&mut active_transcript, "assistant", update); + RealtimeEvent::InputTranscriptDelta(RealtimeTranscriptDelta { delta, .. }) => { + append_transcript_delta(&mut active_transcript.entries, "user", delta); } - RealtimeEvent::OutputTranscriptDone(update) => { - complete_transcript_entry(&mut active_transcript, "assistant", update); + RealtimeEvent::OutputTranscriptDelta(RealtimeTranscriptDelta { delta, .. }) => { + append_transcript_delta(&mut active_transcript.entries, "assistant", delta); } RealtimeEvent::HandoffRequested(handoff) => { if self.event_parser == RealtimeEventParser::V1 { handoff.active_transcript = std::mem::take(&mut active_transcript.entries); - active_transcript.in_progress_parts.clear(); } } RealtimeEvent::SessionUpdated { .. } + | RealtimeEvent::InputTranscriptDone(_) + | RealtimeEvent::OutputTranscriptDone(_) | RealtimeEvent::AudioOut(_) | RealtimeEvent::ResponseCreated(_) | RealtimeEvent::ResponseCancelled(_) @@ -462,182 +439,24 @@ impl RealtimeWebsocketEvents { } } -fn append_transcript_delta( - state: &mut ActiveTranscriptState, - role: &str, - update: &RealtimeTranscriptDelta, -) { - if update.delta.is_empty() { +fn append_transcript_delta(entries: &mut Vec, role: &str, delta: &str) { + if delta.is_empty() { return; } - let key = transcript_part_key( - role, - update.item_id.as_deref(), - update.output_index, - update.content_index, - ); - let key_has_metadata = transcript_part_key_has_metadata(&key); - - if let Some(part_index) = state - .in_progress_parts - .iter() - .position(|part| part.key == key) - { - let entry_index = state.in_progress_parts[part_index].entry_index; - if state - .entries - .get(entry_index) - .is_some_and(|entry| entry.role == role) - && (key_has_metadata || entry_index == state.entries.len().saturating_sub(1)) - { - let insert_at = state.in_progress_parts[part_index].end; - state.entries[entry_index] - .text - .insert_str(insert_at, &update.delta); - adjust_transcript_part_offsets( - state, - entry_index, - insert_at, - isize::try_from(update.delta.len()).unwrap_or(isize::MAX), - ); - state.in_progress_parts[part_index].end += update.delta.len(); - return; - } - - state.in_progress_parts.swap_remove(part_index); - } - - let entry_index = transcript_entry_index(&mut state.entries, role); - let start = state.entries[entry_index].text.len(); - state.entries[entry_index].text.push_str(&update.delta); - - state.in_progress_parts.push(ActiveTranscriptPart { - key, - entry_index, - start, - end: start + update.delta.len(), - }); -} - -fn complete_transcript_entry( - state: &mut ActiveTranscriptState, - role: &str, - update: &RealtimeTranscriptDone, -) { - let key = transcript_part_key( - role, - update.item_id.as_deref(), - update.output_index, - update.content_index, - ); - let key_has_metadata = transcript_part_key_has_metadata(&key); - let part_index = state - .in_progress_parts - .iter() - .position(|part| part.key == key); - - if let Some(part_index) = part_index { - let part = state.in_progress_parts.swap_remove(part_index); - let old_len = part.end.saturating_sub(part.start); - let entry_is_current = - key_has_metadata || part.entry_index == state.entries.len().saturating_sub(1); - let replaced = if entry_is_current - && let Some(entry) = state.entries.get_mut(part.entry_index) - && entry.role == role - && part.end <= entry.text.len() - { - // Done events carry the complete current part, not the complete - // message, so only replace the range accumulated for that part. - entry.text.replace_range(part.start..part.end, &update.text); - true - } else { - false - }; - - if replaced { - let length_delta = isize::try_from(update.text.len()).unwrap_or(isize::MAX) - - isize::try_from(old_len).unwrap_or(isize::MAX); - adjust_transcript_part_offsets(state, part.entry_index, part.end, length_delta); - return; - } - } - - let text = &update.text; - if text.is_empty() { - return; - } - - if let Some(last_entry) = state.entries.last_mut() + if let Some(last_entry) = entries.last_mut() && last_entry.role == role { - last_entry.text.push_str(text); + last_entry.text.push_str(delta); return; } - state.entries.push(RealtimeTranscriptEntry { + entries.push(RealtimeTranscriptEntry { role: role.to_string(), - text: text.to_string(), + text: delta.to_string(), }); } -fn transcript_part_key( - role: &str, - item_id: Option<&str>, - output_index: Option, - content_index: Option, -) -> TranscriptPartKey { - TranscriptPartKey { - role: role.to_string(), - item_id: item_id.map(str::to_string), - output_index, - content_index, - } -} - -fn transcript_part_key_has_metadata(key: &TranscriptPartKey) -> bool { - key.item_id.is_some() || key.output_index.is_some() || key.content_index.is_some() -} - -fn transcript_entry_index(entries: &mut Vec, role: &str) -> usize { - match entries.last() { - Some(last_entry) if last_entry.role == role => entries.len() - 1, - _ => { - entries.push(RealtimeTranscriptEntry { - role: role.to_string(), - text: String::new(), - }); - entries.len() - 1 - } - } -} - -fn adjust_transcript_part_offsets( - state: &mut ActiveTranscriptState, - entry_index: usize, - after: usize, - amount: isize, -) { - if amount == 0 { - return; - } - - for part in &mut state.in_progress_parts { - if part.entry_index == entry_index && part.start >= after { - adjust_offset(&mut part.start, amount); - adjust_offset(&mut part.end, amount); - } - } -} - -fn adjust_offset(offset: &mut usize, amount: isize) { - if amount.is_positive() { - *offset += amount.unsigned_abs(); - } else { - *offset -= amount.unsigned_abs(); - } -} - pub struct RealtimeWebsocketClient { provider: Provider, } @@ -1039,7 +858,6 @@ mod tests { Some(RealtimeEvent::InputTranscriptDelta( RealtimeTranscriptDelta { delta: "hello ".to_string(), - ..Default::default() } )) ); @@ -1058,7 +876,6 @@ mod tests { Some(RealtimeEvent::OutputTranscriptDelta( RealtimeTranscriptDelta { delta: "hi".to_string(), - ..Default::default() } )) ); @@ -1104,9 +921,6 @@ mod tests { Some(RealtimeEvent::InputTranscriptDelta( RealtimeTranscriptDelta { delta: "hello".to_string(), - item_id: Some("item_input_1".to_string()), - output_index: None, - content_index: Some(0), } )) ); @@ -1128,9 +942,6 @@ mod tests { Some(RealtimeEvent::OutputTranscriptDone( RealtimeTranscriptDone { text: "all done".to_string(), - item_id: Some("item_output_1".to_string()), - output_index: Some(0), - content_index: Some(1), } )) ); @@ -1157,122 +968,11 @@ mod tests { Some(RealtimeEvent::OutputTranscriptDone( RealtimeTranscriptDone { text: "hello world".to_string(), - item_id: Some("item_output_1".to_string()), - output_index: None, - content_index: None, } )) ); } - #[test] - fn complete_transcript_entry_replaces_current_part_only() { - let mut state = ActiveTranscriptState::default(); - let first_part_delta = RealtimeTranscriptDelta { - delta: "hello".to_string(), - item_id: Some("item_output_1".to_string()), - output_index: Some(0), - content_index: Some(0), - }; - let second_part_delta = RealtimeTranscriptDelta { - delta: "beta".to_string(), - item_id: Some("item_output_1".to_string()), - output_index: Some(0), - content_index: Some(1), - }; - let first_part_done = RealtimeTranscriptDone { - text: "hello!".to_string(), - item_id: Some("item_output_1".to_string()), - output_index: Some(0), - content_index: Some(0), - }; - let second_part_done = RealtimeTranscriptDone { - text: " beta".to_string(), - item_id: Some("item_output_1".to_string()), - output_index: Some(0), - content_index: Some(1), - }; - - append_transcript_delta(&mut state, "assistant", &first_part_delta); - append_transcript_delta(&mut state, "assistant", &second_part_delta); - complete_transcript_entry(&mut state, "assistant", &first_part_done); - complete_transcript_entry(&mut state, "assistant", &second_part_done); - - assert_eq!( - state, - ActiveTranscriptState { - entries: vec![RealtimeTranscriptEntry { - role: "assistant".to_string(), - text: "hello! beta".to_string(), - }], - in_progress_parts: Vec::new(), - } - ); - } - - #[test] - fn unkeyed_transcript_deltas_start_new_entry_after_role_change() { - let mut state = ActiveTranscriptState::default(); - let assistant_context = RealtimeTranscriptDelta { - delta: "assistant context".to_string(), - ..Default::default() - }; - let delegated_query = RealtimeTranscriptDelta { - delta: "delegated query".to_string(), - ..Default::default() - }; - let assist_confirm = RealtimeTranscriptDelta { - delta: "assist confirm".to_string(), - ..Default::default() - }; - - append_transcript_delta(&mut state, "assistant", &assistant_context); - append_transcript_delta(&mut state, "user", &delegated_query); - append_transcript_delta(&mut state, "assistant", &assist_confirm); - - assert_eq!( - state, - ActiveTranscriptState { - entries: vec![ - RealtimeTranscriptEntry { - role: "assistant".to_string(), - text: "assistant context".to_string(), - }, - RealtimeTranscriptEntry { - role: "user".to_string(), - text: "delegated query".to_string(), - }, - RealtimeTranscriptEntry { - role: "assistant".to_string(), - text: "assist confirm".to_string(), - }, - ], - in_progress_parts: vec![ - ActiveTranscriptPart { - key: transcript_part_key( - "user", /*item_id*/ None, /*output_index*/ None, - /*content_index*/ None, - ), - entry_index: 1, - start: 0, - end: "delegated query".len(), - }, - ActiveTranscriptPart { - key: transcript_part_key( - "assistant", - /*item_id*/ None, - /*output_index*/ None, - /*content_index*/ None, - ), - entry_index: 2, - start: 0, - end: "assist confirm".len(), - }, - ], - } - ); - } - #[test] fn parse_realtime_v2_output_audio_delta_defaults_audio_shape() { let payload = json!({ @@ -1808,7 +1508,6 @@ mod tests { input_delta_event, RealtimeEvent::InputTranscriptDelta(RealtimeTranscriptDelta { delta: "delegate ".to_string(), - ..Default::default() }) ); @@ -1821,7 +1520,6 @@ mod tests { input_delta_event, RealtimeEvent::InputTranscriptDelta(RealtimeTranscriptDelta { delta: "now".to_string(), - ..Default::default() }) ); @@ -1834,7 +1532,6 @@ mod tests { output_delta_event, RealtimeEvent::OutputTranscriptDelta(RealtimeTranscriptDelta { delta: "working".to_string(), - ..Default::default() }) ); diff --git a/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol_common.rs b/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol_common.rs index 00513dc84e8..c89c5ea4d05 100644 --- a/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol_common.rs +++ b/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol_common.rs @@ -51,12 +51,7 @@ pub(super) fn parse_transcript_delta_event( .get(field) .and_then(Value::as_str) .map(str::to_string) - .map(|delta| RealtimeTranscriptDelta { - delta, - item_id: parse_string_field(parsed, "item_id"), - output_index: parse_u32_field(parsed, "output_index"), - content_index: parse_u32_field(parsed, "content_index"), - }) + .map(|delta| RealtimeTranscriptDelta { delta }) } pub(super) fn parse_transcript_done_event( @@ -67,26 +62,7 @@ pub(super) fn parse_transcript_done_event( .get(field) .and_then(Value::as_str) .map(str::to_string) - .map(|text| RealtimeTranscriptDone { - text, - item_id: parse_string_field(parsed, "item_id"), - output_index: parse_u32_field(parsed, "output_index"), - content_index: parse_u32_field(parsed, "content_index"), - }) -} - -fn parse_string_field(parsed: &Value, field: &str) -> Option { - parsed - .get(field) - .and_then(Value::as_str) - .map(str::to_string) -} - -fn parse_u32_field(parsed: &Value, field: &str) -> Option { - parsed - .get(field) - .and_then(Value::as_u64) - .and_then(|value| u32::try_from(value).ok()) + .map(|text| RealtimeTranscriptDone { text }) } pub(super) fn parse_error_event(parsed: &Value) -> Option { diff --git a/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol_v2.rs b/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol_v2.rs index 4cea1535d67..7c1c8d8b879 100644 --- a/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol_v2.rs +++ b/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol_v2.rs @@ -151,12 +151,7 @@ fn parse_item_done_transcript(item: &JsonMap) -> Option Some(RealtimeEvent::InputTranscriptDone(done)), "assistant" => Some(RealtimeEvent::OutputTranscriptDone(done)), diff --git a/codex-rs/protocol/src/protocol.rs b/codex-rs/protocol/src/protocol.rs index 57e17a287bd..7431e5f1230 100644 --- a/codex-rs/protocol/src/protocol.rs +++ b/codex-rs/protocol/src/protocol.rs @@ -294,40 +294,14 @@ pub struct RealtimeAudioFrame { pub item_id: Option, } -#[derive(Debug, Clone, Default, Deserialize, Serialize, PartialEq, Eq, JsonSchema, TS)] +#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq, JsonSchema, TS)] pub struct RealtimeTranscriptDelta { pub delta: String, - /// Internal key material used to reconcile streamed transcript parts. - #[serde(skip)] - #[schemars(skip)] - #[ts(skip)] - pub item_id: Option, - #[serde(skip)] - #[schemars(skip)] - #[ts(skip)] - pub output_index: Option, - #[serde(skip)] - #[schemars(skip)] - #[ts(skip)] - pub content_index: Option, } -#[derive(Debug, Clone, Default, Deserialize, Serialize, PartialEq, Eq, JsonSchema, TS)] +#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq, JsonSchema, TS)] pub struct RealtimeTranscriptDone { pub text: String, - /// Internal key material used to reconcile streamed transcript parts. - #[serde(skip)] - #[schemars(skip)] - #[ts(skip)] - pub item_id: Option, - #[serde(skip)] - #[schemars(skip)] - #[ts(skip)] - pub output_index: Option, - #[serde(skip)] - #[schemars(skip)] - #[ts(skip)] - pub content_index: Option, } #[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq, JsonSchema, TS)] From 1000d90e6b68841577c0101615fad8babb691759 Mon Sep 17 00:00:00 2001 From: Ahmed Ibrahim Date: Mon, 13 Apr 2026 22:31:23 -0700 Subject: [PATCH 15/17] codex: avoid duplicate realtime text done events --- .../endpoint/realtime_websocket/methods.rs | 21 ------------------- .../realtime_websocket/protocol_v2.rs | 3 --- 2 files changed, 24 deletions(-) diff --git a/codex-rs/codex-api/src/endpoint/realtime_websocket/methods.rs b/codex-rs/codex-api/src/endpoint/realtime_websocket/methods.rs index 14fc775b6a0..b1fd1ef75cc 100644 --- a/codex-rs/codex-api/src/endpoint/realtime_websocket/methods.rs +++ b/codex-rs/codex-api/src/endpoint/realtime_websocket/methods.rs @@ -926,27 +926,6 @@ mod tests { ); } - #[test] - fn parse_realtime_v2_output_text_done_event() { - let payload = json!({ - "type": "response.output_text.done", - "item_id": "item_output_1", - "output_index": 0, - "content_index": 1, - "text": "all done" - }) - .to_string(); - - assert_eq!( - parse_realtime_event(payload.as_str(), RealtimeEventParser::RealtimeV2), - Some(RealtimeEvent::OutputTranscriptDone( - RealtimeTranscriptDone { - text: "all done".to_string(), - } - )) - ); - } - #[test] fn parse_realtime_v2_item_done_output_text_event() { let payload = json!({ diff --git a/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol_v2.rs b/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol_v2.rs index 7c1c8d8b879..344d7b86dd3 100644 --- a/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol_v2.rs +++ b/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol_v2.rs @@ -38,9 +38,6 @@ pub(super) fn parse_realtime_event_v2(payload: &str) -> Option { "response.output_text.delta" | "response.output_audio_transcript.delta" => { parse_transcript_delta_event(&parsed, "delta").map(RealtimeEvent::OutputTranscriptDelta) } - "response.output_text.done" => { - parse_transcript_done_event(&parsed, "text").map(RealtimeEvent::OutputTranscriptDone) - } "response.output_audio_transcript.done" => { parse_transcript_done_event(&parsed, "transcript") .map(RealtimeEvent::OutputTranscriptDone) From 52b504ef5ce392d3f0eba7d6bc04ca5e186c46ea Mon Sep 17 00:00:00 2001 From: Ahmed Ibrahim Date: Mon, 13 Apr 2026 23:01:04 -0700 Subject: [PATCH 16/17] codex: avoid duplicate realtime audio transcript done --- .../tests/suite/v2/realtime_conversation.rs | 13 +++++++++++++ .../src/endpoint/realtime_websocket/protocol_v2.rs | 4 ---- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/codex-rs/app-server/tests/suite/v2/realtime_conversation.rs b/codex-rs/app-server/tests/suite/v2/realtime_conversation.rs index a08b0412869..fb8a369bf55 100644 --- a/codex-rs/app-server/tests/suite/v2/realtime_conversation.rs +++ b/codex-rs/app-server/tests/suite/v2/realtime_conversation.rs @@ -731,6 +731,10 @@ async fn realtime_text_output_modality_requests_text_output_and_final_transcript "type": "response.output_text.delta", "delta": "world" }), + json!({ + "type": "response.output_audio_transcript.done", + "transcript": "hello world" + }), json!({ "type": "conversation.item.done", "item": { @@ -829,6 +833,15 @@ async fn realtime_text_output_modality_requests_text_output_and_final_transcript text: "hello world".to_string(), } ); + assert!( + timeout( + Duration::from_millis(200), + mcp.read_stream_until_notification_message("thread/realtime/transcript/done"), + ) + .await + .is_err(), + "should not emit duplicate transcript done from audio transcript done" + ); realtime_server.shutdown().await; Ok(()) diff --git a/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol_v2.rs b/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol_v2.rs index 344d7b86dd3..559e83426bb 100644 --- a/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol_v2.rs +++ b/codex-rs/codex-api/src/endpoint/realtime_websocket/protocol_v2.rs @@ -38,10 +38,6 @@ pub(super) fn parse_realtime_event_v2(payload: &str) -> Option { "response.output_text.delta" | "response.output_audio_transcript.delta" => { parse_transcript_delta_event(&parsed, "delta").map(RealtimeEvent::OutputTranscriptDelta) } - "response.output_audio_transcript.done" => { - parse_transcript_done_event(&parsed, "transcript") - .map(RealtimeEvent::OutputTranscriptDone) - } "input_audio_buffer.speech_started" => Some(RealtimeEvent::InputAudioSpeechStarted( RealtimeInputAudioSpeechStarted { item_id: parsed From a14eabb809a1e81b449aae2e2acdf0a472af229d Mon Sep 17 00:00:00 2001 From: Ahmed Ibrahim Date: Mon, 13 Apr 2026 23:23:32 -0700 Subject: [PATCH 17/17] codex: fix realtime transcript done fixture --- .../app-server/tests/suite/v2/realtime_conversation.rs | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/codex-rs/app-server/tests/suite/v2/realtime_conversation.rs b/codex-rs/app-server/tests/suite/v2/realtime_conversation.rs index fb8a369bf55..ab593381b22 100644 --- a/codex-rs/app-server/tests/suite/v2/realtime_conversation.rs +++ b/codex-rs/app-server/tests/suite/v2/realtime_conversation.rs @@ -482,8 +482,13 @@ async fn realtime_conversation_streams_v2_notifications() -> Result<()> { "delta": "working" }), json!({ - "type": "response.output_text.done", - "text": "working on it" + "type": "conversation.item.done", + "item": { + "id": "item_assistant_1", + "type": "message", + "role": "assistant", + "content": [{ "type": "output_text", "text": "working on it" }] + } }), json!({ "type": "conversation.item.done",