diff --git a/codex-rs/Cargo.lock b/codex-rs/Cargo.lock index 25c12ac154c3..10d63dcfce50 100644 --- a/codex-rs/Cargo.lock +++ b/codex-rs/Cargo.lock @@ -2802,6 +2802,18 @@ dependencies = [ "uuid", ] +[[package]] +name = "codex-rollout-trace" +version = "0.0.0" +dependencies = [ + "anyhow", + "codex-protocol", + "pretty_assertions", + "serde", + "serde_json", + "tempfile", +] + [[package]] name = "codex-sandboxing" version = "0.0.0" diff --git a/codex-rs/Cargo.toml b/codex-rs/Cargo.toml index 236412051177..18ed22f283a7 100644 --- a/codex-rs/Cargo.toml +++ b/codex-rs/Cargo.toml @@ -52,6 +52,7 @@ members = [ "protocol", "realtime-webrtc", "rollout", + "rollout-trace", "rmcp-client", "responses-api-proxy", "response-debug-context", diff --git a/codex-rs/rollout-trace/BUILD.bazel b/codex-rs/rollout-trace/BUILD.bazel new file mode 100644 index 000000000000..730f7de4c920 --- /dev/null +++ b/codex-rs/rollout-trace/BUILD.bazel @@ -0,0 +1,6 @@ +load("//:defs.bzl", "codex_rust_crate") + +codex_rust_crate( + name = "rollout-trace", + crate_name = "codex_rollout_trace", +) diff --git a/codex-rs/rollout-trace/Cargo.toml b/codex-rs/rollout-trace/Cargo.toml new file mode 100644 index 000000000000..e67b35953a5b --- /dev/null +++ b/codex-rs/rollout-trace/Cargo.toml @@ -0,0 +1,23 @@ +[package] +edition.workspace = true +license.workspace = true +name = "codex-rollout-trace" +version.workspace = true + +[lib] +doctest = false +name = "codex_rollout_trace" +path = "src/lib.rs" + +[lints] +workspace = true + +[dependencies] +anyhow = { workspace = true } +codex-protocol = { workspace = true } +serde = { workspace = true, features = ["derive"] } +serde_json = { workspace = true } + +[dev-dependencies] +pretty_assertions = { workspace = true } +tempfile = { workspace = true } diff --git a/codex-rs/rollout-trace/README.md b/codex-rs/rollout-trace/README.md new file mode 100644 index 000000000000..693401f900fd --- /dev/null +++ b/codex-rs/rollout-trace/README.md @@ -0,0 +1,203 @@ +# Rollout 
Trace + +> **Privacy:** Rollout tracing does **not** collect, upload, or report user data; +> it only writes local bundles when `CODEX_ROLLOUT_TRACE_ROOT` is set. + +Rollout tracing is an opt-in diagnostic path for understanding what happened +during a Codex session. It records raw runtime evidence into a local bundle, then +replays that bundle into a semantic graph that a debugger or UI can inspect. + +The key design choice is: **observe first, interpret later**. + +Hot-path Codex code does not try to build the final graph while the session is +running. It writes ordered raw events and payload references. The offline reducer +then decides which events became model-visible conversation, which events were +runtime work, and how information moved between threads, tools, code cells, and +terminal sessions. + +## What This Gives Us + +Rollout traces make failures debuggable when the normal transcript is not enough. +They preserve enough evidence to answer questions like: + +- Which model request produced this tool call? +- Did this output come from the model-visible transcript, a code-mode runtime + value, a terminal operation, or an agent notification? +- Which code-mode `exec` cell issued a nested tool call? +- Which terminal operation created or reused a running process? +- Which multi-agent v2 tool call spawned, messaged, received from, or closed a + child thread? + +The reduced `state.json` is intentionally not just a transcript. It is a graph of +model-visible conversation plus the runtime objects that explain how Codex got +there. 
+ +## System Shape + +```mermaid +flowchart TD + subgraph Runtime["codex-core runtime"] + Protocol["protocol lifecycle\nthread start/end, turn start/end"] + Inference["inference + compaction\nrequests, responses, checkpoints"] + Tools["tool dispatch\ndirect model tools + code-mode nested tools"] + CodeMode["code-mode runtime\nexec cells, yields, waits, termination"] + Terminal["terminal runtime\nexec_command / write_stdin operations"] + Agents["multi_agent_v2\nspawn, task delivery, result, close"] + end + + Recorder["RolloutTraceRecorder\nthin best-effort producer"] + Writer["TraceWriter\nassigns seq and writes payloads before events"] + + subgraph Bundle["trace bundle"] + Manifest["manifest.json\ntrace_id, rollout_id, root_thread_id"] + Events["trace.jsonl\nordered raw event spine"] + Payloads["payloads/*.json\nlarge raw evidence"] + end + + Reducer["replay_bundle\ndeterministic offline reducer"] + + subgraph State["state.json"] + Threads["threads + turns"] + Conversation["conversation_items\nwhat the model saw"] + RuntimeObjects["inference_calls, tool_calls,\ncode_cells, terminals, compactions"] + Edges["interaction_edges\nspawn, task, result, close"] + RawRefs["raw_payload refs"] + end + + Protocol --> Recorder + Inference --> Recorder + Tools --> Recorder + CodeMode --> Recorder + Terminal --> Recorder + Agents --> Recorder + + Recorder --> Writer + Writer --> Manifest + Writer --> Payloads + Writer --> Events + + Manifest --> Reducer + Events --> Reducer + Payloads --> Reducer + + Reducer --> Threads + Reducer --> Conversation + Reducer --> RuntimeObjects + Reducer --> Edges + Reducer --> RawRefs +``` + +The recorder is deliberately small. It is enabled by `CODEX_ROLLOUT_TRACE_ROOT` +and must never make a Codex session fail just because tracing failed. Core emits +raw observations; this crate owns the bundle schema, writer API, and reducer. + +## Bundle Layout + +A trace bundle contains: + +- `manifest.json`: trace identity and bundle metadata. 
+- `trace.jsonl`: append-only raw events ordered by writer-assigned `seq`. +- `payloads/*.json`: raw requests, responses, tool inputs/results, runtime + events, terminal output, compaction data, and protocol snapshots. +- `state.json`: optional reducer output written by `codex debug trace-reduce`. + +`trace_id` identifies this diagnostic artifact. `rollout_id` identifies the +Codex rollout/session being observed. Keeping those separate lets us reason about +the stored trace without confusing it with the product-level session identity. + +To reduce a bundle: + +```bash +codex debug trace-reduce +``` + +By default this writes `/state.json`. + +## Raw Evidence vs Reduced Graph + +```mermaid +flowchart LR + Model["model-visible payloads\nrequests and response output items"] + Runtime["runtime observations\ntool dispatch, terminal output, code-mode JSON"] + RawPayloads["payloads/*.json\nexact evidence"] + Reducer["reducer"] + Conversation["ConversationItem\nwhat the model saw"] + ToolCall["ToolCall\nruntime tool boundary"] + CodeCell["CodeCell\nmodel-authored exec cell"] + TerminalOperation["TerminalOperation\ncommand/write/poll"] + InteractionEdge["InteractionEdge\ninformation flow"] + + Model --> RawPayloads + Runtime --> RawPayloads + RawPayloads --> Reducer + + Reducer --> Conversation + Reducer --> ToolCall + Reducer --> CodeCell + Reducer --> TerminalOperation + Reducer --> InteractionEdge + + CodeCell --> ToolCall + ToolCall --> TerminalOperation + ToolCall --> InteractionEdge + Conversation --> InteractionEdge +``` + +This distinction is the reason the model has both raw payload references and +semantic objects. A code-mode nested tool call, for example, has JSON input and +output at the JavaScript runtime boundary, but the model-visible transcript only +contains the surrounding `exec` custom tool call and its eventual output. + +The reducer keeps those facts separate: + +- `ConversationItem` records what appeared in model-facing requests/responses. 
+- `ToolCall`, `CodeCell`, `TerminalOperation`, `InferenceCall`, and + `Compaction` record runtime/debug boundaries. +- `InteractionEdge` records information flow between objects, such as a + `spawn_agent` tool call delivering a task into a child thread. +- `RawPayloadRef` points back to exact evidence when a viewer needs more detail + than the reduced graph stores inline. + +## Multi-Agent v2 + +Multi-agent v2 child threads share the root trace writer. That means one root +bundle reduces into one graph containing the parent thread, child threads, and +the edges between them. + +```mermaid +flowchart LR + RootTool["root ToolCall\nspawn_agent / followup_task / send_message"] + ChildInput["child ConversationItem\ninjected task/message"] + ChildThread["child AgentThread"] + ChildResult["child assistant ConversationItem\nresult message"] + RootNotice["root ConversationItem\nsubagent notification"] + CloseTool["root ToolCall\nclose_agent"] + TargetThread["target AgentThread"] + + RootTool -- "spawn/task edge" --> ChildInput + ChildInput --> ChildThread + ChildThread --> ChildResult + ChildResult -- "agent_result edge" --> RootNotice + CloseTool -- "close_agent edge" --> TargetThread +``` + +Top-level independent threads still get independent bundles. Spawned child +threads are different: they are part of the same rollout tree, so they belong in +the same raw event log, payload directory, and reduced `state.json`. + +## Reducer Invariants + +The reducer is strict where the raw evidence should be self-consistent: + +- raw events are replayed in `seq` order; +- payload files must exist before events refer to them; +- reduced object IDs are stable within one replay; +- runtime events may be queued until the model-visible source or delivery target + has been observed; +- model-visible conversation is derived from model-facing payloads, not from + runtime convenience output; +- runtime payloads are evidence, not proof that the model saw the same bytes. 
+ +Those invariants let the reduced graph stay small while preserving a path back +to the original evidence whenever a debugger needs to explain why an object or +edge exists. diff --git a/codex-rs/rollout-trace/src/bundle.rs b/codex-rs/rollout-trace/src/bundle.rs new file mode 100644 index 000000000000..8f67da3b3bfa --- /dev/null +++ b/codex-rs/rollout-trace/src/bundle.rs @@ -0,0 +1,49 @@ +//! Trace bundle manifest and local layout constants. + +use serde::Deserialize; +use serde::Serialize; + +use crate::model::AgentThreadId; + +pub(crate) const MANIFEST_FILE_NAME: &str = "manifest.json"; +pub(crate) const RAW_EVENT_LOG_FILE_NAME: &str = "trace.jsonl"; +pub(crate) const PAYLOADS_DIR_NAME: &str = "payloads"; +/// Conventional file name for a reducer-written `RolloutTrace` cache. +pub const REDUCED_STATE_FILE_NAME: &str = "state.json"; +pub(crate) const TRACE_MANIFEST_SCHEMA_VERSION: u32 = 1; +pub(crate) const REDUCED_TRACE_SCHEMA_VERSION: u32 = 1; + +/// Manifest stored at the root of a trace bundle. +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub(crate) struct TraceBundleManifest { + pub(crate) schema_version: u32, + pub(crate) trace_id: String, + pub(crate) rollout_id: String, + /// Root thread for the recorded rollout. Replay should fail rather than + /// inventing a placeholder, because every reduced object is scoped back to + /// this thread tree. + pub(crate) root_thread_id: AgentThreadId, + pub(crate) started_at_unix_ms: i64, + pub(crate) raw_event_log: String, + pub(crate) payloads_dir: String, +} + +impl TraceBundleManifest { + /// Builds a manifest that uses the standard local bundle layout. 
+ pub(crate) fn new( + trace_id: String, + rollout_id: String, + root_thread_id: AgentThreadId, + started_at_unix_ms: i64, + ) -> Self { + Self { + schema_version: TRACE_MANIFEST_SCHEMA_VERSION, + trace_id, + rollout_id, + root_thread_id, + started_at_unix_ms, + raw_event_log: RAW_EVENT_LOG_FILE_NAME.to_string(), + payloads_dir: PAYLOADS_DIR_NAME.to_string(), + } + } +} diff --git a/codex-rs/rollout-trace/src/compaction.rs b/codex-rs/rollout-trace/src/compaction.rs new file mode 100644 index 000000000000..6464f5781fef --- /dev/null +++ b/codex-rs/rollout-trace/src/compaction.rs @@ -0,0 +1,225 @@ +//! Hot-path helpers for recording upstream remote compaction attempts. +//! +//! Remote compaction is a model-facing request with a different semantic role +//! from normal sampling. Keeping the no-op capable trace handle in this crate +//! lets `codex-core` record exact endpoint payloads without owning trace schema +//! details. + +use std::fmt::Display; +use std::sync::Arc; +use std::sync::atomic::AtomicU64; +use std::sync::atomic::Ordering; + +use codex_protocol::models::ResponseItem; +use serde::Serialize; +use serde_json::Value as JsonValue; + +use crate::inference::trace_response_item_json; +use crate::model::AgentThreadId; +use crate::model::CodexTurnId; +use crate::model::CompactionId; +use crate::model::CompactionRequestId; +use crate::payload::RawPayloadKind; +use crate::raw_event::RawTraceEventContext; +use crate::raw_event::RawTraceEventPayload; +use crate::writer::TraceWriter; + +static NEXT_COMPACTION_REQUEST: AtomicU64 = AtomicU64::new(1); + +/// Turn-local remote compaction tracing context. +/// +/// A compaction can retry its upstream request before installing one checkpoint. The context +/// owns the stable checkpoint ID; each request attempt gets a separate request ID. 
+#[derive(Clone, Debug)] +pub struct CompactionTraceContext { + state: CompactionTraceContextState, +} + +#[derive(Clone, Debug)] +enum CompactionTraceContextState { + Disabled, + Enabled(EnabledCompactionTraceContext), +} + +#[derive(Clone, Debug)] +struct EnabledCompactionTraceContext { + writer: Arc, + thread_id: AgentThreadId, + codex_turn_id: CodexTurnId, + compaction_id: CompactionId, + model: String, + provider_name: String, +} + +/// One upstream request attempt made while computing a compaction checkpoint. +#[derive(Clone, Debug)] +pub struct CompactionTraceAttempt { + state: CompactionTraceAttemptState, +} + +#[derive(Clone, Debug)] +enum CompactionTraceAttemptState { + Disabled, + Enabled(EnabledCompactionTraceAttempt), +} + +#[derive(Clone, Debug)] +struct EnabledCompactionTraceAttempt { + context: EnabledCompactionTraceContext, + compaction_request_id: CompactionRequestId, +} + +#[derive(Serialize)] +struct TracedCompactionCompleted { + output_items: Vec, +} + +impl CompactionTraceContext { + /// Builds a context that accepts trace calls and records nothing. + pub fn disabled() -> Self { + Self { + state: CompactionTraceContextState::Disabled, + } + } + + /// Builds an enabled context for upstream attempts that compute one checkpoint. + pub fn enabled( + writer: Arc, + thread_id: AgentThreadId, + codex_turn_id: CodexTurnId, + compaction_id: CompactionId, + model: String, + provider_name: String, + ) -> Self { + Self { + state: CompactionTraceContextState::Enabled(EnabledCompactionTraceContext { + writer, + thread_id, + codex_turn_id, + compaction_id, + model, + provider_name, + }), + } + } + + /// Starts a new upstream attempt and records the exact compact endpoint request. 
+ pub fn start_attempt(&self, request: &impl Serialize) -> CompactionTraceAttempt { + let CompactionTraceContextState::Enabled(context) = &self.state else { + return CompactionTraceAttempt::disabled(); + }; + + let attempt = CompactionTraceAttempt { + state: CompactionTraceAttemptState::Enabled(EnabledCompactionTraceAttempt { + context: context.clone(), + compaction_request_id: next_compaction_request_id(), + }), + }; + attempt.record_started(request); + attempt + } +} + +impl CompactionTraceAttempt { + /// Builds an attempt that records nothing. + fn disabled() -> Self { + Self { + state: CompactionTraceAttemptState::Disabled, + } + } + + fn record_started(&self, request: &impl Serialize) { + let CompactionTraceAttemptState::Enabled(attempt) = &self.state else { + return; + }; + let Some(request_payload) = write_json_payload_best_effort( + &attempt.context.writer, + RawPayloadKind::CompactionRequest, + request, + ) else { + return; + }; + + append_with_context_best_effort( + &attempt.context, + RawTraceEventPayload::CompactionRequestStarted { + compaction_id: attempt.context.compaction_id.clone(), + compaction_request_id: attempt.compaction_request_id.clone(), + thread_id: attempt.context.thread_id.clone(), + codex_turn_id: attempt.context.codex_turn_id.clone(), + model: attempt.context.model.clone(), + provider_name: attempt.context.provider_name.clone(), + request_payload, + }, + ); + } + + /// Records the non-streaming compact endpoint response payload. + /// + /// Compaction responses use the same response-item preservation rules as + /// inference streams: traces are evidence, while normal ResponseItem + /// serialization is shaped for future request construction. 
+ pub fn record_completed(&self, output_items: &[ResponseItem]) { + let response_payload = TracedCompactionCompleted { + output_items: output_items.iter().map(trace_response_item_json).collect(), + }; + let CompactionTraceAttemptState::Enabled(attempt) = &self.state else { + return; + }; + let Some(response_payload) = write_json_payload_best_effort( + &attempt.context.writer, + RawPayloadKind::CompactionResponse, + &response_payload, + ) else { + return; + }; + + append_with_context_best_effort( + &attempt.context, + RawTraceEventPayload::CompactionRequestCompleted { + compaction_id: attempt.context.compaction_id.clone(), + compaction_request_id: attempt.compaction_request_id.clone(), + response_payload, + }, + ); + } + + /// Records pre-response failures from the compact endpoint. + pub fn record_failed(&self, error: impl Display) { + let CompactionTraceAttemptState::Enabled(attempt) = &self.state else { + return; + }; + append_with_context_best_effort( + &attempt.context, + RawTraceEventPayload::CompactionRequestFailed { + compaction_id: attempt.context.compaction_id.clone(), + compaction_request_id: attempt.compaction_request_id.clone(), + error: error.to_string(), + }, + ); + } +} + +fn next_compaction_request_id() -> CompactionRequestId { + let ordinal = NEXT_COMPACTION_REQUEST.fetch_add(1, Ordering::Relaxed); + format!("compaction_request:{ordinal}") +} + +fn write_json_payload_best_effort( + writer: &TraceWriter, + kind: RawPayloadKind, + payload: &impl Serialize, +) -> Option { + writer.write_json_payload(kind, payload).ok() +} + +fn append_with_context_best_effort( + context: &EnabledCompactionTraceContext, + payload: RawTraceEventPayload, +) { + let event_context = RawTraceEventContext { + thread_id: Some(context.thread_id.clone()), + codex_turn_id: Some(context.codex_turn_id.clone()), + }; + let _ = context.writer.append_with_context(event_context, payload); +} diff --git a/codex-rs/rollout-trace/src/inference.rs b/codex-rs/rollout-trace/src/inference.rs 
new file mode 100644 index 000000000000..506be96ad9c1 --- /dev/null +++ b/codex-rs/rollout-trace/src/inference.rs @@ -0,0 +1,369 @@ +//! Hot-path helpers for recording upstream inference attempts. +//! +//! The model client should not need to know whether rollout tracing is enabled. +//! A disabled context records nothing, which keeps one-shot HTTP calls, +//! WebSocket reuse, and retry/fallback attempts on the same code path. + +use std::fmt::Display; +use std::sync::Arc; +use std::sync::atomic::AtomicU64; +use std::sync::atomic::Ordering; + +use codex_protocol::models::ResponseItem; +use codex_protocol::protocol::TokenUsage; +use serde::Serialize; +use serde_json::Value as JsonValue; + +use crate::model::AgentThreadId; +use crate::model::CodexTurnId; +use crate::model::InferenceCallId; +use crate::payload::RawPayloadKind; +use crate::raw_event::RawTraceEventContext; +use crate::raw_event::RawTraceEventPayload; +use crate::writer::TraceWriter; + +static NEXT_INFERENCE_ATTEMPT: AtomicU64 = AtomicU64::new(1); + +/// Turn-local inference tracing context. +/// +/// This is intentionally a no-op capable handle instead of an `Option` at each +/// transport callsite. Whether tracing is enabled is a session concern; retry, +/// fallback, and stream mapping code should always be able to say what happened +/// without first branching on trace availability. +#[derive(Clone, Debug)] +pub struct InferenceTraceContext { + state: InferenceTraceContextState, +} + +#[derive(Clone, Debug)] +enum InferenceTraceContextState { + Disabled, + Enabled(EnabledInferenceTraceContext), +} + +#[derive(Clone, Debug)] +struct EnabledInferenceTraceContext { + writer: Arc, + thread_id: AgentThreadId, + codex_turn_id: CodexTurnId, + model: String, + provider_name: String, +} + +/// One concrete upstream request attempt. +/// +/// A Codex turn can create multiple attempts when auth recovery retries the +/// HTTP request or WebSocket setup falls back to HTTP. 
Completion is often +/// observed after the client returns the response stream, so attempts are +/// cloneable and self-contained. +#[derive(Clone, Debug)] +pub struct InferenceTraceAttempt { + state: InferenceTraceAttemptState, +} + +#[derive(Clone, Debug)] +enum InferenceTraceAttemptState { + Disabled, + Enabled(EnabledInferenceTraceAttempt), +} + +#[derive(Clone, Debug)] +struct EnabledInferenceTraceAttempt { + context: EnabledInferenceTraceContext, + inference_call_id: InferenceCallId, +} + +/// Non-delta response payload saved when a traced inference stream completes. +/// +/// We intentionally record completed output items instead of every stream delta +/// here. The raw stream can be added later as a separate payload class; this +/// response summary gives the reducer stable response identity, usage, and +/// model-visible output without duplicating high-volume text deltas. +#[derive(Serialize)] +struct TracedResponseStreamCompleted<'a> { + response_id: &'a str, + token_usage: &'a Option, + output_items: Vec, +} + +impl InferenceTraceContext { + /// Builds a context that accepts trace calls and records nothing. + pub fn disabled() -> Self { + Self { + state: InferenceTraceContextState::Disabled, + } + } + + /// Builds an enabled context for all upstream attempts made by one Codex turn. + pub fn enabled( + writer: Arc, + thread_id: AgentThreadId, + codex_turn_id: CodexTurnId, + model: String, + provider_name: String, + ) -> Self { + Self { + state: InferenceTraceContextState::Enabled(EnabledInferenceTraceContext { + writer, + thread_id, + codex_turn_id, + model, + provider_name, + }), + } + } + + /// Starts a new attempt after the concrete provider request has been built. 
+ pub fn start_attempt(&self) -> InferenceTraceAttempt { + let InferenceTraceContextState::Enabled(context) = &self.state else { + return InferenceTraceAttempt::disabled(); + }; + + InferenceTraceAttempt { + state: InferenceTraceAttemptState::Enabled(EnabledInferenceTraceAttempt { + context: context.clone(), + inference_call_id: next_inference_call_id(), + }), + } + } +} + +impl InferenceTraceAttempt { + /// Builds an attempt that records nothing. + pub fn disabled() -> Self { + Self { + state: InferenceTraceAttemptState::Disabled, + } + } + + /// Records the exact request object about to be sent to the model provider. + pub fn record_started(&self, request: &impl Serialize) { + let InferenceTraceAttemptState::Enabled(attempt) = &self.state else { + return; + }; + let Some(request_payload) = write_json_payload_best_effort( + &attempt.context.writer, + RawPayloadKind::InferenceRequest, + request, + ) else { + return; + }; + + append_with_context_best_effort( + &attempt.context, + RawTraceEventPayload::InferenceStarted { + inference_call_id: attempt.inference_call_id.clone(), + thread_id: attempt.context.thread_id.clone(), + codex_turn_id: attempt.context.codex_turn_id.clone(), + model: attempt.context.model.clone(), + provider_name: attempt.context.provider_name.clone(), + request_payload, + }, + ); + } + + /// Records a bounded, non-streaming summary of the completed response stream. + /// + /// The caller passes protocol-native response items so this crate owns the + /// trace-specific serialization rules. That keeps codex-core focused on + /// transport behavior while preserving trace evidence that normal request + /// serialization intentionally omits. 
+ pub fn record_completed( + &self, + response_id: &str, + token_usage: &Option, + output_items: &[ResponseItem], + ) { + let response_payload = TracedResponseStreamCompleted { + response_id, + token_usage, + output_items: output_items.iter().map(trace_response_item_json).collect(), + }; + let InferenceTraceAttemptState::Enabled(attempt) = &self.state else { + return; + }; + let Some(response_payload) = write_json_payload_best_effort( + &attempt.context.writer, + RawPayloadKind::InferenceResponse, + &response_payload, + ) else { + return; + }; + + append_with_context_best_effort( + &attempt.context, + RawTraceEventPayload::InferenceCompleted { + inference_call_id: attempt.inference_call_id.clone(), + response_id: Some(response_id.to_string()), + response_payload, + }, + ); + } + + /// Records pre-response and mid-stream failures. + pub fn record_failed(&self, error: impl Display) { + let InferenceTraceAttemptState::Enabled(attempt) = &self.state else { + return; + }; + append_with_context_best_effort( + &attempt.context, + RawTraceEventPayload::InferenceFailed { + inference_call_id: attempt.inference_call_id.clone(), + error: error.to_string(), + partial_response_payload: None, + }, + ); + } +} + +/// Serializes a response item for trace evidence rather than future request construction. +/// +/// The protocol serializer intentionally omits some readable reasoning content +/// when shaping items for later model requests. Rollout traces need the item as +/// Codex received it, so this helper restores that content in the raw payload. +pub(crate) fn trace_response_item_json(item: &ResponseItem) -> JsonValue { + let mut value = serde_json::to_value(item).unwrap_or_else(|err| { + serde_json::json!({ + "serialization_error": err.to_string(), + }) + }); + + if let ResponseItem::Reasoning { + content: Some(content), + .. 
+ } = item + && let JsonValue::Object(object) = &mut value + { + object.insert( + "content".to_string(), + serde_json::to_value(content).unwrap_or_else(|err| { + serde_json::json!({ + "serialization_error": err.to_string(), + }) + }), + ); + } + + value +} + +fn next_inference_call_id() -> InferenceCallId { + let ordinal = NEXT_INFERENCE_ATTEMPT.fetch_add(1, Ordering::Relaxed); + format!("inference:{ordinal}") +} + +fn write_json_payload_best_effort( + writer: &TraceWriter, + kind: RawPayloadKind, + payload: &impl Serialize, +) -> Option { + writer.write_json_payload(kind, payload).ok() +} + +fn append_with_context_best_effort( + context: &EnabledInferenceTraceContext, + payload: RawTraceEventPayload, +) { + let event_context = RawTraceEventContext { + thread_id: Some(context.thread_id.clone()), + codex_turn_id: Some(context.codex_turn_id.clone()), + }; + let _ = context.writer.append_with_context(event_context, payload); +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use codex_protocol::models::ReasoningItemContent; + use codex_protocol::models::ReasoningItemReasoningSummary; + use pretty_assertions::assert_eq; + use serde_json::json; + use tempfile::TempDir; + + use super::*; + use crate::model::ExecutionStatus; + use crate::replay_bundle; + + #[test] + fn enabled_context_records_replayable_inference_attempt() -> anyhow::Result<()> { + let temp = TempDir::new()?; + let writer = Arc::new(TraceWriter::create( + temp.path(), + "trace-1".to_string(), + "rollout-1".to_string(), + "thread-root".to_string(), + )?); + writer.append(RawTraceEventPayload::ThreadStarted { + thread_id: "thread-root".to_string(), + agent_path: "/root".to_string(), + metadata_payload: None, + })?; + writer.append(RawTraceEventPayload::CodexTurnStarted { + codex_turn_id: "turn-1".to_string(), + thread_id: "thread-root".to_string(), + })?; + let context = InferenceTraceContext::enabled( + writer, + "thread-root".to_string(), + "turn-1".to_string(), + "gpt-test".to_string(), + 
"test-provider".to_string(), + ); + + let attempt = context.start_attempt(); + attempt.record_started(&json!({ + "model": "gpt-test", + "input": [{ + "type": "message", + "role": "user", + "content": [{"type": "input_text", "text": "hello"}] + }], + })); + attempt.record_completed("resp-1", &None, &[]); + + let rollout = replay_bundle(temp.path())?; + let inference = rollout + .inference_calls + .values() + .next() + .expect("recorded inference call"); + + assert_eq!(rollout.inference_calls.len(), 1); + assert_eq!(inference.thread_id, "thread-root"); + assert_eq!(inference.codex_turn_id, "turn-1"); + assert_eq!(inference.execution.status, ExecutionStatus::Completed); + assert_eq!(inference.upstream_request_id, Some("resp-1".to_string())); + assert_eq!(rollout.raw_payloads.len(), 2); + + Ok(()) + } + + #[test] + fn traced_response_item_preserves_reasoning_content_omitted_by_normal_serializer() { + let item = ResponseItem::Reasoning { + id: "rs-1".to_string(), + summary: vec![ReasoningItemReasoningSummary::SummaryText { + text: "summary".to_string(), + }], + content: Some(vec![ReasoningItemContent::Text { + text: "raw reasoning".to_string(), + }]), + encrypted_content: Some("encoded".to_string()), + }; + + let normal = serde_json::to_value(&item).expect("response item serializes"); + let traced = trace_response_item_json(&item); + + assert_eq!(normal.get("content"), None); + assert_eq!( + traced, + json!({ + "type": "reasoning", + "summary": [{"type": "summary_text", "text": "summary"}], + "content": [{"type": "text", "text": "raw reasoning"}], + "encrypted_content": "encoded", + }), + ); + } +} diff --git a/codex-rs/rollout-trace/src/lib.rs b/codex-rs/rollout-trace/src/lib.rs new file mode 100644 index 000000000000..5c8022d6d711 --- /dev/null +++ b/codex-rs/rollout-trace/src/lib.rs @@ -0,0 +1,49 @@ +//! Trace bundle format, writer, and reducer for Codex rollouts. +//! +//! This crate owns the trace schema. Hot-path Codex code should depend on the +//! 
small writer API here; semantic replay and viewer projections stay outside +//! `codex-core`. +//! +//! See `README.md` for the system diagram and reducer model. + +mod bundle; +mod compaction; +mod inference; +mod model; +mod payload; +mod raw_event; +mod reducer; +mod writer; + +/// Conventional reduced-state cache name written next to a raw trace bundle. +pub use bundle::REDUCED_STATE_FILE_NAME; +/// No-op-capable handle for recording remote-compaction requests. +pub use compaction::CompactionTraceAttempt; +/// Shared recorder context for a compaction checkpoint. +pub use compaction::CompactionTraceContext; +/// No-op-capable handle for recording one upstream inference attempt. +pub use inference::InferenceTraceAttempt; +/// Shared recorder context for inference attempts within one Codex turn. +pub use inference::InferenceTraceContext; +/// Public reduced trace model returned by replay. +pub use model::*; +/// Stable identifier for one raw payload inside a rollout bundle. +pub use payload::RawPayloadId; +/// Coarse role labels for raw payload files. +pub use payload::RawPayloadKind; +/// Reference to a raw request/response/log payload stored in the bundle. +pub use payload::RawPayloadRef; +/// Monotonic sequence number assigned by the raw trace writer. +pub use raw_event::RawEventSeq; +/// Runtime requester observed before semantic reduction. +pub use raw_event::RawToolCallRequester; +/// One append-only raw trace event from `trace.jsonl`. +pub use raw_event::RawTraceEvent; +/// Event-envelope context supplied by hot-path trace producers. +pub use raw_event::RawTraceEventContext; +/// Typed payload for one raw trace event. +pub use raw_event::RawTraceEventPayload; +/// Replay a raw trace bundle and write/read its reduced `RolloutTrace`. +pub use reducer::replay_bundle; +/// Append-only writer used by hot-path Codex instrumentation. 
+pub use writer::TraceWriter; diff --git a/codex-rs/rollout-trace/src/model/conversation.rs b/codex-rs/rollout-trace/src/model/conversation.rs new file mode 100644 index 000000000000..e248b72d42e3 --- /dev/null +++ b/codex-rs/rollout-trace/src/model/conversation.rs @@ -0,0 +1,176 @@ +use serde::Deserialize; +use serde::Serialize; + +use crate::payload::RawPayloadId; + +use super::AgentThreadId; +use super::CodeCellId; +use super::CodexTurnId; +use super::CompactionId; +use super::ConversationItemId; +use super::EdgeId; +use super::InferenceCallId; +use super::ModelVisibleCallId; +use super::ToolCallId; +use super::session::ExecutionWindow; + +/// One logical transcript item or transcript boundary. +/// +/// The reducer builds conversation items primarily from inference request and +/// response payloads. Runtime objects can be listed in `produced_by`, but they +/// must not rewrite what the item body says the model saw. Structural items, +/// such as compaction markers, live in the same ordered list so conversation +/// views can show where the live history changed. +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub struct ConversationItem { + pub item_id: ConversationItemId, + pub thread_id: AgentThreadId, + /// Runtime activation that first introduced this item locally, when known. + pub codex_turn_id: Option, + pub first_seen_at_unix_ms: i64, + pub role: ConversationRole, + /// Codex channel for assistant/tool content, when the item is channel-specific. + pub channel: Option, + pub kind: ConversationItemKind, + pub body: ConversationBody, + /// Protocol/model `call_id` for function/custom tool call and output items. + pub call_id: Option, + /// Runtime or control-plane objects that caused this conversation item to exist. + pub produced_by: Vec, +} + +/// Model-visible role assigned to a conversation item. 
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum ConversationRole { + System, + Developer, + User, + Assistant, + Tool, +} + +/// Codex channel for model-visible content. +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum ConversationChannel { + Analysis, + Commentary, + Final, + /// Remote compaction summaries are reintroduced as assistant summary-channel content. + Summary, +} + +/// Responses item category after normalization into the reduced transcript. +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum ConversationItemKind { + Message, + Reasoning, + FunctionCall, + FunctionCallOutput, + CustomToolCall, + CustomToolCallOutput, + /// Structural marker inserted where live history was replaced by compaction. + CompactionMarker, +} + +/// Ordered content parts for a reduced conversation item. +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub struct ConversationBody { + /// Renderable model-visible parts. Raw payload refs are used when the bytes + /// are too large or too structured for the normal conversation path. + pub parts: Vec, +} + +/// One model-visible part inside a conversation item. +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case", tag = "type")] +pub enum ConversationPart { + Text { + text: String, + }, + /// A model-provided summary of content whose full form may also be present. + /// + /// Reasoning summaries are not interchangeable with raw reasoning text: + /// both can be present in one payload, and replay/debug tooling needs to + /// preserve which representation the model actually returned. + Summary { + text: String, + }, + /// Opaque model-visible content that is intentionally not decoded here. + /// + /// Reasoning can be carried as `encrypted_content` with no readable text. 
+ /// Keeping that blob inline makes it part of item identity, unlike a raw + /// payload reference whose ID changes every time the same item is replayed + /// in a later inference request. + Encoded { + label: String, + value: String, + }, + /// Small JSON-ish body represented by a summary plus a raw ref. + Json { + summary: String, + raw_payload_id: RawPayloadId, + }, + Code { + language: String, + source: String, + }, + /// Large or uncommon payload that should be lazy-loaded from details UI. + PayloadRef { + label: String, + raw_payload_id: RawPayloadId, + }, +} + +/// Explanation for where a conversation item came from. +/// +/// This is deliberately plural at the call site: a function output can be both +/// model-visible conversation and the product of a runtime tool call. +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case", tag = "type")] +pub enum ProducerRef { + UserInput, + Inference { inference_call_id: InferenceCallId }, + Tool { tool_call_id: ToolCallId }, + CodeCell { code_cell_id: CodeCellId }, + InteractionEdge { edge_id: EdgeId }, + Compaction { compaction_id: CompactionId }, + Harness, +} + +/// One outbound inference request and its response metadata. +/// +/// Full upstream request/response bodies live behind raw payload refs. The +/// request/response item ID lists are the reduced, model-visible snapshot. +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub struct InferenceCall { + pub inference_call_id: InferenceCallId, + pub thread_id: AgentThreadId, + pub codex_turn_id: CodexTurnId, + pub execution: ExecutionWindow, + pub model: String, + pub provider_name: String, + /// Upstream request ID returned by HTTP/proxy/engine infrastructure. + pub upstream_request_id: Option, + /// Complete ordered input snapshot sent with this request. + pub request_item_ids: Vec, + /// Ordered output items produced by this response. 
+ pub response_item_ids: Vec, + /// Runtime tool calls whose model-visible call item came from this response. + pub tool_call_ids_started_by_response: Vec, + pub usage: Option, + pub raw_request_payload_id: RawPayloadId, + /// Full upstream response payload. `None` while running or after pre-stream failures. + pub raw_response_payload_id: Option, +} + +/// Token usage summary for one inference call. +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct TokenUsage { + pub input_tokens: u64, + pub cached_input_tokens: u64, + pub output_tokens: u64, + pub reasoning_output_tokens: u64, +} diff --git a/codex-rs/rollout-trace/src/model/mod.rs b/codex-rs/rollout-trace/src/model/mod.rs new file mode 100644 index 000000000000..e265e2910028 --- /dev/null +++ b/codex-rs/rollout-trace/src/model/mod.rs @@ -0,0 +1,121 @@ +//! Reduced rollout trace model. +//! +//! These types describe the deterministic replay output. They intentionally +//! separate model-visible conversation from runtime/debug objects. + +use std::collections::BTreeMap; + +use serde::Deserialize; +use serde::Serialize; + +use crate::payload::RawPayloadId; +use crate::payload::RawPayloadRef; +mod conversation; +mod runtime; +mod session; + +pub use conversation::*; +pub use runtime::*; +pub use session::*; + +/// Codex conversation/session UUID. +pub type AgentThreadId = String; +/// Stable multi-agent routing path such as `/root` or `/root/search_docs`. +pub type AgentPath = String; +/// Runtime submission/activation UUID. This is not a chat turn. +pub type CodexTurnId = String; +/// Reduced transcript item ID assigned by the trace reducer. +pub type ConversationItemId = String; +/// Local ID for one outbound upstream inference request. +pub type InferenceCallId = String; +/// Reducer-owned ID for one runtime tool-call object. +pub type ToolCallId = String; +/// Responses `call_id` / custom-tool call ID visible in inference payloads. 
+pub type ModelVisibleCallId = String; +/// Tool invocation ID assigned inside the code-mode JavaScript runtime. +pub type CodeModeRuntimeToolId = String; +/// Reducer-owned ID for one model-authored `exec` JavaScript cell. +pub type CodeCellId = String; +/// Process/session ID returned by Codex's terminal runtime. +pub type TerminalId = String; +/// Reducer-owned ID for one command/write/poll operation against a terminal. +pub type TerminalOperationId = String; +/// Reducer-owned ID for one installed conversation-history checkpoint. +pub type CompactionId = String; +/// Reducer-owned ID for one upstream request that computes a compaction. +pub type CompactionRequestId = String; +/// Reducer-owned ID for one information-flow edge. +pub type EdgeId = String; +/// Reducer-owned ID for request/log correlation metadata. +pub type CorrelationId = String; + +/// Canonical reduced graph for one Codex rollout. +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub struct RolloutTrace { + pub schema_version: u32, + /// Unique identity for this trace capture. + /// + /// `rollout_id` names the Codex rollout/session being observed. `trace_id` + /// names the diagnostic artifact produced for that rollout, which keeps + /// storage/replay identity separate from the product-level session identity. + pub trace_id: String, + /// CLI-visible rollout/run identity. Higher-level experiment/sample IDs wrap this object. + pub rollout_id: String, + pub started_at_unix_ms: i64, + /// Wall-clock timestamp for terminal rollout status. `None` means running or partial trace. + pub ended_at_unix_ms: Option, + pub status: RolloutStatus, + pub root_thread_id: AgentThreadId, + pub threads: BTreeMap, + pub codex_turns: BTreeMap, + pub conversation_items: BTreeMap, + pub inference_calls: BTreeMap, + /// Model-authored `exec` JavaScript cells keyed by reducer-owned cell ID. 
+ pub code_cells: BTreeMap, + pub tool_calls: BTreeMap, + /// Terminal runtime sessions keyed by process/session ID returned by the runtime. + pub terminal_sessions: BTreeMap, + /// Commands/writes/polls against terminals keyed by reducer-owned operation ID. + pub terminal_operations: BTreeMap, + /// Installed compaction checkpoints keyed by checkpoint ID. + pub compactions: BTreeMap, + /// Upstream remote compaction calls keyed by local request ID. + pub compaction_requests: BTreeMap, + /// Information-flow edges between threads, cells, tools, and runtime resources. + pub interaction_edges: BTreeMap, + /// Raw JSON payloads keyed by raw-payload ID. Most point at files outside this object. + pub raw_payloads: BTreeMap, +} + +impl RolloutTrace { + /// Builds an empty reduced trace that a reducer can populate. + pub(crate) fn new( + schema_version: u32, + trace_id: String, + rollout_id: String, + root_thread_id: AgentThreadId, + started_at_unix_ms: i64, + ) -> Self { + Self { + schema_version, + trace_id, + rollout_id, + started_at_unix_ms, + ended_at_unix_ms: None, + status: RolloutStatus::Running, + root_thread_id, + threads: BTreeMap::new(), + codex_turns: BTreeMap::new(), + conversation_items: BTreeMap::new(), + inference_calls: BTreeMap::new(), + code_cells: BTreeMap::new(), + tool_calls: BTreeMap::new(), + terminal_sessions: BTreeMap::new(), + terminal_operations: BTreeMap::new(), + compactions: BTreeMap::new(), + compaction_requests: BTreeMap::new(), + interaction_edges: BTreeMap::new(), + raw_payloads: BTreeMap::new(), + } + } +} diff --git a/codex-rs/rollout-trace/src/model/runtime.rs b/codex-rs/rollout-trace/src/model/runtime.rs new file mode 100644 index 000000000000..a2afcb0636dc --- /dev/null +++ b/codex-rs/rollout-trace/src/model/runtime.rs @@ -0,0 +1,331 @@ +use serde::Deserialize; +use serde::Serialize; + +use crate::payload::RawPayloadId; +use crate::raw_event::RawEventSeq; + +use super::AgentPath; +use super::AgentThreadId; +use super::CodeCellId; 
+use super::CodeModeRuntimeToolId; +use super::CodexTurnId; +use super::CompactionId; +use super::CompactionRequestId; +use super::ConversationItemId; +use super::EdgeId; +use super::ModelVisibleCallId; +use super::TerminalId; +use super::TerminalOperationId; +use super::ToolCallId; +use super::session::ExecutionWindow; + +/// Runtime/debug object for one model-authored `exec` cell. +/// +/// The JavaScript source and custom-tool outputs are still conversation items; +/// this object tracks the code-mode runtime boundary and nested runtime work. +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub struct CodeCell { + /// Reducer-owned graph id derived from the model-visible `exec` call id. + /// Runtime cell ids are stored separately because they are only handles for + /// later waits and nested code-mode tools. + pub code_cell_id: CodeCellId, + pub model_visible_call_id: ModelVisibleCallId, + pub thread_id: AgentThreadId, + pub codex_turn_id: CodexTurnId, + /// Conversation item containing the model-authored JavaScript. + pub source_item_id: ConversationItemId, + pub output_item_ids: Vec, + /// Raw code-mode runtime/session id, useful when matching runtime payloads. + pub runtime_cell_id: Option, + /// Full JS-cell runtime window; yielded cells can outlive the initial custom call. + pub execution: ExecutionWindow, + pub runtime_status: CodeCellRuntimeStatus, + pub initial_response_at_unix_ms: Option, + pub initial_response_seq: Option, + pub yielded_at_unix_ms: Option, + pub yielded_seq: Option, + pub source_js: String, + pub nested_tool_call_ids: Vec, + pub wait_tool_call_ids: Vec, +} + +/// Code-mode runtime lifecycle. +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum CodeCellRuntimeStatus { + /// The `exec` request has been accepted but the runtime has not yet started user code. + Starting, + /// Runtime is executing JavaScript and has not yet yielded or terminated. 
+ Running, + /// Initial `exec` returned while JavaScript kept running in the background. + Yielded, + /// Runtime reached a normal terminal result. + Completed, + /// Runtime reached an error terminal result. + Failed, + /// Runtime was explicitly terminated. + Terminated, +} + +/// Installed conversation-history replacement boundary. +/// +/// Duration-bearing upstream requests live in `CompactionRequest`. This object +/// is the checkpoint where replacement history became the live thread history. +/// The boundary marker and the model-visible summary are separate conversation +/// items: the marker says where history was replaced, while the summary is part +/// of `replacement_item_ids` when the compact endpoint returned one. +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub struct Compaction { + pub compaction_id: CompactionId, + pub thread_id: AgentThreadId, + pub codex_turn_id: CodexTurnId, + pub installed_at_unix_ms: i64, + /// Structural conversation item marking where pre-compaction history ended. + pub marker_item_id: ConversationItemId, + /// Upstream compaction request attempts that contributed to this checkpoint. + pub request_ids: Vec, + /// Logical conversation items present immediately before replacement. + pub input_item_ids: Vec, + /// Replacement conversation items installed by the checkpoint. + pub replacement_item_ids: Vec, +} + +/// One upstream remote request made while computing a compaction checkpoint. +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub struct CompactionRequest { + pub compaction_request_id: CompactionRequestId, + pub compaction_id: CompactionId, + pub thread_id: AgentThreadId, + pub codex_turn_id: CodexTurnId, + pub execution: ExecutionWindow, + pub model: String, + pub provider_name: String, + pub raw_request_payload_id: RawPayloadId, + /// Full compaction response payload. `None` while running or after pre-response failures. 
+ pub raw_response_payload_id: Option, +} + +/// Runtime operation requested by the model, a JS code cell, or Codex itself. +/// +/// A `ToolCall` is not a chat transcript row. Model-visible call/output items +/// link to it through `model_visible_*_item_ids`; runtime-only tools can have +/// empty model-visible lists. +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub struct ToolCall { + pub tool_call_id: ToolCallId, + /// Model-visible protocol call ID, if the model directly requested this tool. + pub model_visible_call_id: Option, + /// Code-mode runtime's internal tool invocation ID, if this call came from JS. + pub code_mode_runtime_tool_id: Option, + pub thread_id: AgentThreadId, + /// Runtime activation that started the tool. Background work may outlive this turn. + pub started_by_codex_turn_id: Option, + pub execution: ExecutionWindow, + pub requester: ToolCallRequester, + pub kind: ToolCallKind, + pub model_visible_call_item_ids: Vec, + pub model_visible_output_item_ids: Vec, + /// Terminal operation started by this tool, when the tool touched a terminal. + pub terminal_operation_id: Option, + pub summary: ToolCallSummary, + /// Original invocation at the Codex tool boundary. + /// + /// Direct model tools store the model's function/custom call payload here. + /// Code-mode nested tools store the JSON call made by model-authored JS. + /// Runtime protocol events are deliberately kept separate below because + /// they describe how Codex executed the request, not what the caller sent. + pub raw_invocation_payload_id: Option, + /// Result returned to the immediate requester. + /// + /// For direct tools this is the tool output item returned to the model; for + /// code-mode nested tools this is the value returned to JavaScript. + pub raw_result_payload_id: Option, + /// Runtime/protocol payloads observed while executing the tool. + /// + /// Examples include exec begin/end, patch begin/end, and MCP begin/end + /// events. 
Reducers can use these to build richer runtime objects such as + /// terminal operations without overwriting the canonical invocation/result. + pub raw_runtime_payload_ids: Vec, +} + +/// Requester of a runtime tool. +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case", tag = "type")] +pub enum ToolCallRequester { + Model, + /// Model-authored JavaScript requested the tool through code-mode. + CodeCell { + code_cell_id: CodeCellId, + }, +} + +/// Runtime tool category. +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case", tag = "type")] +pub enum ToolCallKind { + ExecCommand, + WriteStdin, + ApplyPatch, + Mcp { + server: String, + tool: String, + }, + Web, + ImageGeneration, + SpawnAgent, + AssignAgentTask, + SendMessage, + /// Multi-agent wait operation. Code-mode wait is modeled separately. + WaitAgent, + CloseAgent, + Other { + name: String, + }, +} + +/// Bounded card/list summary for a tool call. +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case", tag = "type")] +pub enum ToolCallSummary { + /// Tool is summarized by its terminal operation. + Terminal { operation_id: TerminalOperationId }, + Agent { + target_agent_path: AgentPath, + /// Task name/path segment when the operation creates or targets a task. + task_name: Option, + message_preview: String, + }, + WaitAgent { + /// Wait target, when narrower than "any child". + target_agent_path: Option, + timeout_ms: Option, + }, + Generic { + label: String, + input_preview: Option, + output_preview: Option, + }, +} + +/// Reusable terminal process/session returned by the runtime. +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub struct TerminalSession { + pub terminal_id: TerminalId, + pub thread_id: AgentThreadId, + pub created_by_operation_id: TerminalOperationId, + pub operation_ids: Vec, + /// Terminal lifetime. 
This can outlive the operation that created it. + pub execution: ExecutionWindow, +} + +/// One command/write/poll operation against a terminal session. +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub struct TerminalOperation { + pub operation_id: TerminalOperationId, + /// Runtime terminal/process ID. `None` is legal only while the operation that creates it is starting. + pub terminal_id: Option, + pub tool_call_id: ToolCallId, + pub kind: TerminalOperationKind, + /// Operation execution window. This is not necessarily the terminal session lifetime. + pub execution: ExecutionWindow, + pub request: TerminalRequest, + /// Runtime-observed terminal result. Model-visible output links through observations. + pub result: Option, + pub model_observations: Vec, + pub raw_payload_ids: Vec, +} + +/// Terminal operation category. +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum TerminalOperationKind { + ExecCommand, + WriteStdin, +} + +/// Terminal request summary. +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case", tag = "type")] +pub enum TerminalRequest { + ExecCommand { + command: Vec, + display_command: String, + cwd: String, + yield_time_ms: Option, + max_output_tokens: Option, + }, + /// Request to interact with an existing terminal. + WriteStdin { + /// Bytes/text sent to stdin. Empty string means poll/read without writing bytes. + stdin: String, + yield_time_ms: Option, + max_output_tokens: Option, + }, +} + +/// Terminal result observed by the runtime. +/// +/// This is debugger/runtime output. It is not proof that the model saw the same +/// bytes; link model-visible call/output items through `TerminalModelObservation`. +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct TerminalResult { + /// Process exit code. `None` if the process is still running or no exit status was produced. 
+ pub exit_code: Option, + pub stdout: String, + pub stderr: String, + /// Tool runtime's formatted caller-facing output, when present. + pub formatted_output: Option, + /// Token count before truncation, when the tool runtime reported it. + pub original_token_count: Option, + /// Streaming chunk ID, when this result was assembled from chunked terminal output. + pub chunk_id: Option, +} + +/// Conversation items that observed a terminal operation. +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct TerminalModelObservation { + pub call_item_ids: Vec, + pub output_item_ids: Vec, + pub source: TerminalObservationSource, +} + +/// Source of model-visible terminal observation. +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum TerminalObservationSource { + DirectToolCall, + CodeCellOutput, +} + +/// Directed information-flow relationship between trace objects. +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub struct InteractionEdge { + pub edge_id: EdgeId, + pub kind: InteractionEdgeKind, + pub source: TraceAnchor, + pub target: TraceAnchor, + pub started_at_unix_ms: i64, + pub ended_at_unix_ms: Option, + pub carried_item_ids: Vec, + pub carried_raw_payload_ids: Vec, +} + +/// Information-flow edge category. +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum InteractionEdgeKind { + SpawnAgent, + AssignAgentTask, + SendMessage, + AgentResult, + CloseAgent, +} + +/// Typed pointer to one stable reduced-trace object. 
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case", tag = "type")] +pub enum TraceAnchor { + ConversationItem { item_id: ConversationItemId }, + ToolCall { tool_call_id: ToolCallId }, + Thread { thread_id: AgentThreadId }, +} diff --git a/codex-rs/rollout-trace/src/model/session.rs b/codex-rs/rollout-trace/src/model/session.rs new file mode 100644 index 000000000000..fccc386dc932 --- /dev/null +++ b/codex-rs/rollout-trace/src/model/session.rs @@ -0,0 +1,110 @@ +use serde::Deserialize; +use serde::Serialize; + +use crate::raw_event::RawEventSeq; + +use super::AgentPath; +use super::AgentThreadId; +use super::CodexTurnId; +use super::ConversationItemId; +use super::EdgeId; + +/// Coarse terminal status for the rollout. +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum RolloutStatus { + /// Writer has not seen a terminal rollout event. + Running, + /// Rollout ended normally. + Completed, + /// Rollout ended because an operation failed. + Failed, + /// Rollout was cancelled or otherwise stopped before normal completion. + Aborted, +} + +/// One Codex thread/session participating in the rollout. +/// +/// Threads are agents in the multi-agent sense, but the root interactive +/// session is represented by the same object. Runtime objects live in top-level +/// maps and point back to their owning thread; only transcript order is stored +/// here because compaction/reconciliation makes it semantic. +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub struct AgentThread { + pub thread_id: AgentThreadId, + /// Stable routing identity. Viewer/search should prefer this over nickname. + pub agent_path: AgentPath, + /// Presentation hint. It can collide and must not be used as identity. + pub nickname: Option, + pub origin: AgentOrigin, + /// Session lifecycle for this thread. 
+ /// + /// Child threads can end independently from the root rollout, for example + /// after a parent calls `close_agent`. Keeping this on the thread prevents + /// those shutdowns from being mistaken for whole-rollout completion. + pub execution: ExecutionWindow, + /// Configured model presentation hint. Individual inference calls carry the actual upstream model. + pub default_model: Option, + /// Logical conversation items first observed for this thread, in transcript order. + pub conversation_item_ids: Vec, +} + +/// Provenance for a traced Codex thread. +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case", tag = "type")] +pub enum AgentOrigin { + Root, + Spawned { + parent_thread_id: AgentThreadId, + /// Interaction edge that carried the spawn task. + spawn_edge_id: EdgeId, + /// Stable path segment/task name selected by the parent/tool call. + task_name: String, + /// Selected agent role/type, for example `worker` or `explorer`. + agent_role: String, + }, +} + +/// Runtime interval for a typed trace object. +/// +/// Wall-clock timestamps are for display and latency. Sequence numbers are the +/// causal ordering primitive and should be used to pair observations or break +/// same-millisecond ties. +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct ExecutionWindow { + pub started_at_unix_ms: i64, + pub started_seq: RawEventSeq, + pub ended_at_unix_ms: Option, + pub ended_seq: Option, + pub status: ExecutionStatus, +} + +/// Coarse lifecycle status for a runtime object. +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum ExecutionStatus { + /// Object is still live or the trace ended before its terminal event. + Running, + /// Object completed successfully. + Completed, + /// Object reached an error state. + Failed, + /// Object was cancelled by user/policy/runtime before completion. 
+ Cancelled, + /// Object was aborted when its owner/runtime stopped. + Aborted, +} + +/// One activation of the Codex runtime for one thread. +/// +/// A Codex turn groups protocol/runtime work for one thread activation. +/// It is not a user/assistant message pair; conversation belongs in +/// `ConversationItem`. +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub struct CodexTurn { + pub codex_turn_id: CodexTurnId, + pub thread_id: AgentThreadId, + pub execution: ExecutionWindow, + /// Conversation items that directly triggered this activation, when known. + pub input_item_ids: Vec, +} diff --git a/codex-rs/rollout-trace/src/payload.rs b/codex-rs/rollout-trace/src/payload.rs new file mode 100644 index 000000000000..5efc7dc3eebe --- /dev/null +++ b/codex-rs/rollout-trace/src/payload.rs @@ -0,0 +1,49 @@ +//! References to heavyweight trace payloads stored outside the reduced graph. + +use serde::Deserialize; +use serde::Serialize; + +/// Stable identifier for one raw payload inside a rollout bundle. +pub type RawPayloadId = String; + +/// Reference to a raw request/response/log payload. +/// +/// `RolloutTrace` stores these references so normal timeline and conversation +/// rendering does not require the browser or reducer output to inline every +/// upstream request, tool response, or terminal log. +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub struct RawPayloadRef { + pub raw_payload_id: RawPayloadId, + /// Payload role. This lets details UI choose syntax highlighting and labels + /// without opening the payload file first. + pub kind: RawPayloadKind, + /// Path relative to the trace bundle root. + /// + /// The writer always materializes payloads as bundle-local files. Keeping + /// this as a plain path avoids exposing storage modes we do not produce. + pub path: String, +} + +/// Coarse role of a raw payload. 
+#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)] +#[serde(rename_all = "snake_case", tag = "type", content = "value")] +pub enum RawPayloadKind { + InferenceRequest, + /// Full upstream inference response or non-delta response stream summary. + InferenceResponse, + CompactionRequest, + /// Trace-only checkpoint captured when processed replacement history is installed. + CompactionCheckpoint, + CompactionResponse, + ToolInvocation, + ToolResult, + /// Raw runtime/protocol observation for an executing tool. + ToolRuntimeEvent, + /// Raw terminal runtime event or stream shard. + TerminalRuntimeEvent, + ProtocolEvent, + /// One-shot metadata captured when a Codex session/thread starts. + SessionMetadata, + /// Runtime notification payload carried when a child agent reports back to its parent. + AgentResult, +} diff --git a/codex-rs/rollout-trace/src/raw_event.rs b/codex-rs/rollout-trace/src/raw_event.rs new file mode 100644 index 000000000000..b364601408e4 --- /dev/null +++ b/codex-rs/rollout-trace/src/raw_event.rs @@ -0,0 +1,285 @@ +//! Append-only raw trace events. + +use crate::model::AgentThreadId; +use crate::model::CodeCellRuntimeStatus; +use crate::model::CodexTurnId; +use crate::model::CompactionId; +use crate::model::CompactionRequestId; +use crate::model::EdgeId; +use crate::model::ExecutionStatus; +use crate::model::InferenceCallId; +use crate::model::ModelVisibleCallId; +use crate::model::RolloutStatus; +use crate::model::ToolCallId; +use crate::model::ToolCallKind; +use crate::model::ToolCallSummary; +use crate::payload::RawPayloadRef; +use serde::Deserialize; +use serde::Serialize; +use serde_json::Value; + +/// Monotonic sequence number assigned by the raw trace writer. +pub type RawEventSeq = u64; + +/// Current raw event envelope schema version. +pub(crate) const RAW_TRACE_EVENT_SCHEMA_VERSION: u32 = 1; + +/// One append-only raw trace event. 
+/// +/// Every event uses the same envelope so partial replay and corruption checks +/// can run before the reducer understands the event-specific payload. +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub struct RawTraceEvent { + pub schema_version: u32, + /// Contiguous writer-assigned order inside one rollout event log. + pub seq: RawEventSeq, + /// Unix wall-clock timestamp in milliseconds. Use for display/latency. + pub wall_time_unix_ms: i64, + pub rollout_id: String, + pub thread_id: Option, + pub codex_turn_id: Option, + pub payload: RawTraceEventPayload, +} + +/// Writer-supplied context that appears in the raw event envelope. +#[derive(Debug, Clone, Default, PartialEq, Eq)] +pub struct RawTraceEventContext { + pub thread_id: Option, + pub codex_turn_id: Option, +} + +/// Runtime requester as observed at the raw tool boundary. +/// +/// This intentionally uses runtime-local identifiers. The reducer is the only +/// place that maps these handles to graph identities such as `CodeCellId`. +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case", tag = "type")] +pub enum RawToolCallRequester { + Model, + CodeCell { + /// Runtime-local code-mode cell handle. + runtime_cell_id: String, + }, +} + +/// Typed payload for a raw trace event. +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case", tag = "type")] +pub enum RawTraceEventPayload { + RolloutStarted { + trace_id: String, + root_thread_id: AgentThreadId, + }, + RolloutEnded { + status: RolloutStatus, + }, + ThreadStarted { + thread_id: AgentThreadId, + /// Stable agent path. 
+ agent_path: String, + metadata_payload: Option, + }, + ThreadEnded { + thread_id: AgentThreadId, + status: RolloutStatus, + }, + CodexTurnStarted { + codex_turn_id: CodexTurnId, + thread_id: AgentThreadId, + }, + CodexTurnEnded { + codex_turn_id: CodexTurnId, + status: ExecutionStatus, + }, + InferenceStarted { + inference_call_id: InferenceCallId, + thread_id: AgentThreadId, + codex_turn_id: CodexTurnId, + model: String, + provider_name: String, + request_payload: RawPayloadRef, + }, + InferenceCompleted { + inference_call_id: InferenceCallId, + response_id: Option, + response_payload: RawPayloadRef, + }, + InferenceFailed { + inference_call_id: InferenceCallId, + error: String, + /// Partial response payload, when stream events arrived before failure. + partial_response_payload: Option, + }, + ToolCallStarted { + tool_call_id: ToolCallId, + /// Protocol/model call ID when this runtime call came from model output. + model_visible_call_id: Option, + /// Code-mode runtime bridge ID when model-authored code issued this call. + code_mode_runtime_tool_id: Option, + /// Runtime requester that caused this tool lifecycle. + requester: RawToolCallRequester, + kind: ToolCallKind, + summary: ToolCallSummary, + invocation_payload: Option, + }, + ToolCallRuntimeStarted { + tool_call_id: ToolCallId, + /// Runtime/protocol observation for how Codex began executing the tool. + runtime_payload: RawPayloadRef, + }, + ToolCallRuntimeEnded { + tool_call_id: ToolCallId, + status: ExecutionStatus, + /// Runtime/protocol observation for how Codex finished executing the tool. + runtime_payload: RawPayloadRef, + }, + ToolCallEnded { + tool_call_id: ToolCallId, + status: ExecutionStatus, + result_payload: Option, + }, + CodeCellStarted { + /// Runtime-local handle allocated by code mode for waits and nested tools. + runtime_cell_id: String, + /// Custom tool call id on the model-visible `exec` item. 
+ model_visible_call_id: ModelVisibleCallId, + /// JavaScript source after the public `exec` wrapper has been parsed. + source_js: String, + }, + CodeCellInitialResponse { + /// Runtime-local handle, matching `CodeCellStarted`. + runtime_cell_id: String, + status: CodeCellRuntimeStatus, + response_payload: Option, + }, + CodeCellEnded { + /// Runtime-local handle, matching `CodeCellStarted`. + runtime_cell_id: String, + status: CodeCellRuntimeStatus, + response_payload: Option, + }, + CompactionRequestStarted { + compaction_id: CompactionId, + compaction_request_id: CompactionRequestId, + thread_id: AgentThreadId, + codex_turn_id: CodexTurnId, + model: String, + provider_name: String, + request_payload: RawPayloadRef, + }, + CompactionRequestCompleted { + compaction_id: CompactionId, + compaction_request_id: CompactionRequestId, + response_payload: RawPayloadRef, + }, + CompactionRequestFailed { + compaction_id: CompactionId, + compaction_request_id: CompactionRequestId, + error: String, + }, + /// Checkpoint installation event for remote-compacted replacement history. + CompactionInstalled { + compaction_id: CompactionId, + /// Trace-only checkpoint payload. Do not route this through public UI protocol. + checkpoint_payload: RawPayloadRef, + }, + /// Multi-agent v2 child-to-parent completion delivery. + AgentResultObserved { + edge_id: EdgeId, + child_thread_id: AgentThreadId, + child_codex_turn_id: CodexTurnId, + parent_thread_id: AgentThreadId, + message: String, + /// Raw notification payload. This is evidence for the runtime delivery, + /// not the parent-side model-visible item. + carried_payload: Option, + }, + /// Existing UI/protocol event wrapped into trace format. + ProtocolEventObserved { + event_type: String, + event_payload: RawPayloadRef, + }, + /// Structured payload for early instrumentation before a dedicated variant exists. + Other { + kind: String, + summary: String, + payloads: Vec, + /// Small structured metadata. 
Large data belongs in `payloads`. + metadata: Value, + }, +} + +impl RawTraceEventPayload { + /// Raw payload refs that must exist before this raw event is appended. + pub(crate) fn raw_payload_refs(&self) -> Vec<&RawPayloadRef> { + match self { + RawTraceEventPayload::RolloutStarted { .. } + | RawTraceEventPayload::RolloutEnded { .. } + | RawTraceEventPayload::ThreadEnded { .. } + | RawTraceEventPayload::CodexTurnStarted { .. } + | RawTraceEventPayload::CodexTurnEnded { .. } + | RawTraceEventPayload::CompactionRequestFailed { .. } + | RawTraceEventPayload::CodeCellStarted { .. } + | RawTraceEventPayload::AgentResultObserved { + carried_payload: None, + .. + } => Vec::new(), + RawTraceEventPayload::ThreadStarted { + metadata_payload, .. + } => metadata_payload.iter().collect(), + RawTraceEventPayload::InferenceStarted { + request_payload, .. + } + | RawTraceEventPayload::InferenceCompleted { + response_payload: request_payload, + .. + } + | RawTraceEventPayload::CompactionRequestStarted { + request_payload, .. + } + | RawTraceEventPayload::CompactionRequestCompleted { + response_payload: request_payload, + .. + } + | RawTraceEventPayload::CompactionInstalled { + checkpoint_payload: request_payload, + .. + } + | RawTraceEventPayload::ProtocolEventObserved { + event_payload: request_payload, + .. + } => vec![request_payload], + RawTraceEventPayload::InferenceFailed { + partial_response_payload, + .. + } + | RawTraceEventPayload::ToolCallStarted { + invocation_payload: partial_response_payload, + .. + } + | RawTraceEventPayload::ToolCallEnded { + result_payload: partial_response_payload, + .. + } + | RawTraceEventPayload::CodeCellInitialResponse { + response_payload: partial_response_payload, + .. + } + | RawTraceEventPayload::CodeCellEnded { + response_payload: partial_response_payload, + .. + } => partial_response_payload.iter().collect(), + RawTraceEventPayload::AgentResultObserved { + carried_payload: Some(carried_payload), + .. 
+ } => vec![carried_payload], + RawTraceEventPayload::ToolCallRuntimeStarted { + runtime_payload, .. + } + | RawTraceEventPayload::ToolCallRuntimeEnded { + runtime_payload, .. + } => vec![runtime_payload], + RawTraceEventPayload::Other { payloads, .. } => payloads.iter().collect(), + } + } +} diff --git a/codex-rs/rollout-trace/src/reducer/code_cell.rs b/codex-rs/rollout-trace/src/reducer/code_cell.rs new file mode 100644 index 000000000000..11ad55f48ebd --- /dev/null +++ b/codex-rs/rollout-trace/src/reducer/code_cell.rs @@ -0,0 +1,738 @@ +//! Code-mode reduction. +//! +//! A code cell is the runtime parent for model-authored `exec` +//! JavaScript. Nested tools, waits, and terminal operations hang off this +//! object so viewers can inspect runtime work without flattening it into the +//! model-visible conversation. +//! +//! The reducer has to reconcile two clocks: +//! - model-visible items come from inference request/response payloads; +//! - runtime work starts as soon as Codex dispatches the tool. +//! +//! In real traces `CodeCellStarted` can arrive before the inference completion +//! payload that contains the `custom_tool_call` item. We therefore queue starts +//! until their source conversation item exists, then attach runtime edges. + +use anyhow::Context; +use anyhow::Result; +use anyhow::bail; +use serde_json::Value; + +use super::TraceReducer; +use crate::model::CodeCell; +use crate::model::CodeCellId; +use crate::model::CodeCellRuntimeStatus; +use crate::model::ConversationItemKind; +use crate::model::ExecutionStatus; +use crate::model::ExecutionWindow; +use crate::model::ProducerRef; +use crate::model::ToolCallId; +use crate::model::ToolCallRequester; +use crate::payload::RawPayloadRef; +use crate::raw_event::RawEventSeq; +use crate::raw_event::RawToolCallRequester; + +/// Runtime start payload for one model-authored code-mode exec call. 
+/// +/// The reduced id is already derived from the model-visible call id before this +/// reaches the code-cell reducer, so the reducer can reconcile runtime lifecycle +/// events against a stable graph identity. +pub(super) struct StartedCodeCell { + pub(super) code_cell_id: CodeCellId, + pub(super) runtime_cell_id: String, + pub(super) model_visible_call_id: crate::model::ModelVisibleCallId, + pub(super) source_js: String, +} + +/// Queued code-cell start waiting for its model-visible source item. +/// +/// Code execution can begin before inference stream completion records the +/// custom-tool call item that authored it. This wrapper keeps the original +/// event timing intact until that source item exists. +pub(super) struct PendingCodeCellStart { + pub(super) seq: RawEventSeq, + pub(super) wall_time_unix_ms: i64, + pub(super) thread_id: String, + pub(super) codex_turn_id: Option, + pub(super) started: StartedCodeCell, +} + +/// Lifecycle event observed before a queued code cell has materialized. +/// +/// These events are replayed after the start is resolved so failed or very fast +/// cells do not lose runtime status while preserving source-item ownership. +pub(super) struct PendingCodeCellLifecycleEvent { + pub(super) seq: RawEventSeq, + pub(super) wall_time_unix_ms: i64, + pub(super) kind: PendingCodeCellLifecycleEventKind, +} + +/// Runtime lifecycle transitions that can arrive while a code-cell start is queued. +pub(super) enum PendingCodeCellLifecycleEventKind { + InitialResponse { + runtime_cell_id: String, + status: CodeCellRuntimeStatus, + }, + Ended { + status: CodeCellRuntimeStatus, + }, +} + +impl TraceReducer { + /// Starts a code cell once its model-visible source item exists. + /// + /// Runtime events are allowed to arrive before stream completion has + /// reduced the model output that requested `exec`. 
Queueing preserves the + /// event order while still requiring every final `CodeCell` to point at the + /// exact conversation item that authored its JavaScript. + pub(super) fn start_or_queue_code_cell(&mut self, pending: PendingCodeCellStart) -> Result<()> { + let code_cell_id = pending.started.code_cell_id.clone(); + if self + .source_item_id_for_pending_code_cell(&pending)? + .is_none() + { + if self.rollout.code_cells.contains_key(&code_cell_id) + || self.pending_code_cell_starts.contains_key(&code_cell_id) + { + bail!("duplicate code cell start for {code_cell_id}"); + } + self.pending_code_cell_starts.insert(code_cell_id, pending); + return Ok(()); + } + + self.start_code_cell(pending) + } + + /// Materializes any queued code-cell starts unlocked by newly reduced conversation items. + /// + /// This is called after inference and compaction conversation reduction, + /// because those are the only paths that create model-visible items today. + pub(super) fn flush_pending_code_cell_starts(&mut self) -> Result<()> { + let mut ready_ids = Vec::new(); + for (code_cell_id, pending) in &self.pending_code_cell_starts { + if self + .source_item_id_for_pending_code_cell(pending)? + .is_some() + { + ready_ids.push(code_cell_id.clone()); + } + } + + for code_cell_id in ready_ids { + let Some(pending) = self.pending_code_cell_starts.remove(&code_cell_id) else { + continue; + }; + self.start_code_cell(pending)?; + } + Ok(()) + } + + /// Inserts the reduced `CodeCell` once source ownership can be proven. 
+ fn start_code_cell(&mut self, pending: PendingCodeCellStart) -> Result<()> { + let PendingCodeCellStart { + seq, + wall_time_unix_ms, + thread_id, + codex_turn_id, + started, + } = pending; + if self.rollout.code_cells.contains_key(&started.code_cell_id) { + bail!("duplicate code cell start for {}", started.code_cell_id); + } + + let Some(codex_turn_id) = codex_turn_id else { + bail!( + "code cell start {} did not include a Codex turn id", + started.code_cell_id + ); + }; + self.validate_code_cell_turn(&thread_id, &codex_turn_id)?; + + let source_item_id = self.source_item_id_for_code_cell_start( + &thread_id, + &started.code_cell_id, + &started.model_visible_call_id, + )?; + let output_item_ids = self.model_visible_code_cell_item_ids( + &thread_id, + &started.model_visible_call_id, + ConversationItemKind::CustomToolCallOutput, + ); + // Runtime events may also have arrived while the start was queued. + // Seed these reverse links from already-reduced tool calls so replay is + // order-insensitive within the known trace causality. 
+ let requester = ToolCallRequester::CodeCell { + code_cell_id: started.code_cell_id.clone(), + }; + let nested_tool_call_ids = self + .rollout + .tool_calls + .values() + .filter(|tool_call| tool_call.requester == requester) + .map(|tool_call| tool_call.tool_call_id.clone()) + .collect(); + + self.rollout.code_cells.insert( + started.code_cell_id.clone(), + CodeCell { + code_cell_id: started.code_cell_id.clone(), + model_visible_call_id: started.model_visible_call_id, + thread_id: thread_id.clone(), + codex_turn_id, + source_item_id, + output_item_ids: output_item_ids.clone(), + runtime_cell_id: Some(started.runtime_cell_id), + execution: ExecutionWindow { + started_at_unix_ms: wall_time_unix_ms, + started_seq: seq, + ended_at_unix_ms: None, + ended_seq: None, + status: ExecutionStatus::Running, + }, + runtime_status: CodeCellRuntimeStatus::Starting, + initial_response_at_unix_ms: None, + initial_response_seq: None, + yielded_at_unix_ms: None, + yielded_seq: None, + source_js: started.source_js, + nested_tool_call_ids, + wait_tool_call_ids: Vec::new(), + }, + ); + + self.thread_mut(&thread_id)?; + + for item_id in output_item_ids { + self.add_code_cell_output_item(&started.code_cell_id, &item_id)?; + } + self.flush_pending_code_cell_lifecycle_events(&started.code_cell_id)?; + + Ok(()) + } + + /// Returns the source item if the model-visible `exec` call has been reduced. + fn source_item_id_for_pending_code_cell( + &self, + pending: &PendingCodeCellStart, + ) -> Result> { + Ok(self + .model_visible_code_cell_item_ids( + &pending.thread_id, + &pending.started.model_visible_call_id, + ConversationItemKind::CustomToolCall, + ) + .into_iter() + .next()) + } + + /// Records the runtime's first response for a code cell, or waits for its source item. + /// + /// Code-mode execution can start and fail before the inference response payload + /// that introduced the model-visible `exec` call has been reduced. 
In that + /// case the cell start is already pending; keep the lifecycle event beside it + /// instead of weakening the invariant that every reduced cell has a source + /// conversation item. + pub(super) fn record_or_queue_code_cell_initial_response( + &mut self, + seq: RawEventSeq, + wall_time_unix_ms: i64, + code_cell_id: CodeCellId, + runtime_cell_id: String, + status: CodeCellRuntimeStatus, + ) -> Result<()> { + if !self.rollout.code_cells.contains_key(&code_cell_id) { + if self.pending_code_cell_starts.contains_key(&code_cell_id) { + self.queue_code_cell_lifecycle_event( + code_cell_id, + PendingCodeCellLifecycleEvent { + seq, + wall_time_unix_ms, + kind: PendingCodeCellLifecycleEventKind::InitialResponse { + runtime_cell_id, + status, + }, + }, + ); + return Ok(()); + } + bail!("code cell initial response referenced unknown cell {code_cell_id}"); + } + self.record_code_cell_initial_response( + seq, + wall_time_unix_ms, + code_cell_id, + runtime_cell_id, + status, + ) + } + + fn record_code_cell_initial_response( + &mut self, + seq: RawEventSeq, + wall_time_unix_ms: i64, + code_cell_id: CodeCellId, + runtime_cell_id: String, + status: CodeCellRuntimeStatus, + ) -> Result<()> { + let Some(cell) = self.rollout.code_cells.get_mut(&code_cell_id) else { + bail!("code cell initial response referenced unknown cell {code_cell_id}"); + }; + + cell.runtime_cell_id = Some(runtime_cell_id); + if cell.initial_response_at_unix_ms.is_none() { + cell.initial_response_at_unix_ms = Some(wall_time_unix_ms); + cell.initial_response_seq = Some(seq); + } + if status == CodeCellRuntimeStatus::Yielded { + cell.yielded_at_unix_ms = Some(wall_time_unix_ms); + cell.yielded_seq = Some(seq); + } + cell.runtime_status = status; + Ok(()) + } + + /// Ends a code cell, or waits until its queued start can materialize. 
+ /// + /// This mirrors `record_or_queue_code_cell_initial_response`: the reducer is + /// strict about unknown cells, but a cell whose start is pending on the + /// model-visible source item is known and just needs its lifecycle replayed + /// after the source item appears. + pub(super) fn end_or_queue_code_cell( + &mut self, + seq: RawEventSeq, + wall_time_unix_ms: i64, + code_cell_id: CodeCellId, + status: CodeCellRuntimeStatus, + ) -> Result<()> { + if !self.rollout.code_cells.contains_key(&code_cell_id) { + if self.pending_code_cell_starts.contains_key(&code_cell_id) { + self.queue_code_cell_lifecycle_event( + code_cell_id, + PendingCodeCellLifecycleEvent { + seq, + wall_time_unix_ms, + kind: PendingCodeCellLifecycleEventKind::Ended { status }, + }, + ); + return Ok(()); + } + bail!("code cell end referenced unknown cell {code_cell_id}"); + } + self.end_code_cell(seq, wall_time_unix_ms, code_cell_id, status) + } + + fn end_code_cell( + &mut self, + seq: RawEventSeq, + wall_time_unix_ms: i64, + code_cell_id: CodeCellId, + status: CodeCellRuntimeStatus, + ) -> Result<()> { + let Some(cell) = self.rollout.code_cells.get_mut(&code_cell_id) else { + bail!("code cell end referenced unknown cell {code_cell_id}"); + }; + + if cell.initial_response_at_unix_ms.is_none() { + cell.initial_response_at_unix_ms = Some(wall_time_unix_ms); + cell.initial_response_seq = Some(seq); + } + cell.execution.ended_at_unix_ms = Some(wall_time_unix_ms); + cell.execution.ended_seq = Some(seq); + cell.execution.status = execution_status_for_code_cell(&status); + cell.runtime_status = status; + Ok(()) + } + + /// Closes unfinished code cells when their owning turn is interrupted. + /// + /// A yielded code cell can outlive a completed turn and be resumed by a + /// later `wait`, so normal turn completion must not imply cell completion. + /// Cancellation/failure is different: the model-visible JS frame has been + /// abandoned even if nested terminal work reports late runtime events. 
In + /// that case leaving the cell `running` makes a completed trace look live. + pub(super) fn terminate_running_code_cells_for_turn_end( + &mut self, + seq: RawEventSeq, + wall_time_unix_ms: i64, + codex_turn_id: &str, + turn_status: &ExecutionStatus, + ) -> Result<()> { + let runtime_status = match turn_status { + ExecutionStatus::Running | ExecutionStatus::Completed => return Ok(()), + ExecutionStatus::Failed => CodeCellRuntimeStatus::Failed, + ExecutionStatus::Cancelled | ExecutionStatus::Aborted => { + CodeCellRuntimeStatus::Terminated + } + }; + let code_cell_ids: Vec<_> = self + .rollout + .code_cells + .values() + .filter(|cell| { + cell.codex_turn_id == codex_turn_id + && cell.execution.status == ExecutionStatus::Running + }) + .map(|cell| cell.code_cell_id.clone()) + .collect(); + + for code_cell_id in code_cell_ids { + self.end_code_cell(seq, wall_time_unix_ms, code_cell_id, runtime_status.clone())?; + } + Ok(()) + } + + fn queue_code_cell_lifecycle_event( + &mut self, + code_cell_id: CodeCellId, + event: PendingCodeCellLifecycleEvent, + ) { + let events = self + .pending_code_cell_lifecycle_events + .entry(code_cell_id) + .or_default(); + events.push(event); + events.sort_by_key(|event| event.seq); + } + + fn flush_pending_code_cell_lifecycle_events(&mut self, code_cell_id: &str) -> Result<()> { + let Some(events) = self.pending_code_cell_lifecycle_events.remove(code_cell_id) else { + return Ok(()); + }; + for event in events { + match event.kind { + PendingCodeCellLifecycleEventKind::InitialResponse { + runtime_cell_id, + status, + } => self.record_code_cell_initial_response( + event.seq, + event.wall_time_unix_ms, + code_cell_id.to_string(), + runtime_cell_id, + status, + )?, + PendingCodeCellLifecycleEventKind::Ended { status } => self.end_code_cell( + event.seq, + event.wall_time_unix_ms, + code_cell_id.to_string(), + status, + )?, + } + } + Ok(()) + } + + /// Links a nested tool call back to its parent code cell. 
+ /// + /// If the parent cell is still queued, the link is recovered later from already + /// reduced tool calls when the cell materializes. + pub(super) fn link_tool_call_to_code_cell( + &mut self, + tool_call_id: &ToolCallId, + requester: &ToolCallRequester, + ) -> Result<()> { + let ToolCallRequester::CodeCell { code_cell_id } = requester else { + return Ok(()); + }; + let Some(cell) = self.rollout.code_cells.get_mut(code_cell_id) else { + // The cell start may still be queued behind the inference payload + // that contains its model-visible source item. `start_code_cell` + // backfills these already-reduced nested calls once the source + // ownership can be proven. + return Ok(()); + }; + push_unique(&mut cell.nested_tool_call_ids, tool_call_id); + Ok(()) + } + + /// Records that a model-visible wait call is waiting on a runtime code cell. + /// + /// Wait calls are not nested JavaScript tools, so the relationship is inferred + /// from the runtime cell id inside the function arguments. + pub(super) fn link_wait_tool_call_from_request_payload( + &mut self, + thread_id: &str, + tool_call_id: &ToolCallId, + request_payload: Option<&RawPayloadRef>, + ) -> Result<()> { + let Some(request_payload) = request_payload else { + return Ok(()); + }; + let payload = self.read_payload_json(request_payload)?; + if payload.get("tool_name").and_then(Value::as_str) != Some("wait") { + return Ok(()); + } + // `wait` is a normal model-visible function call, not a nested JS tool + // request. The only stable edge back to the code cell is the runtime + // `cell_id` inside the function arguments. 
+ let Some(arguments) = payload + .get("payload") + .and_then(|payload| payload.get("arguments")) + .and_then(Value::as_str) + else { + bail!( + "wait tool request payload {} did not contain function arguments", + request_payload.raw_payload_id + ); + }; + let arguments: Value = serde_json::from_str(arguments).with_context(|| { + format!( + "wait tool request payload {} had invalid JSON arguments", + request_payload.raw_payload_id + ) + })?; + let Some(runtime_cell_id) = arguments.get("cell_id").and_then(Value::as_str) else { + bail!( + "wait tool request payload {} did not contain cell_id", + request_payload.raw_payload_id + ); + }; + let Some(code_cell_id) = + self.code_cell_id_for_runtime_cell_id_if_known(thread_id, runtime_cell_id) + else { + return Ok(()); + }; + let Some(cell) = self.rollout.code_cells.get_mut(&code_cell_id) else { + return Ok(()); + }; + push_unique(&mut cell.wait_tool_call_ids, tool_call_id); + Ok(()) + } + + /// Attaches a later-observed model-visible output item to its code cell. + /// + /// This is used when an inference request carries a custom-tool output after + /// the runtime cell already exists. + pub(super) fn attach_model_visible_code_cell_item( + &mut self, + item_id: &str, + call_id: Option<&str>, + kind: &ConversationItemKind, + ) -> Result<()> { + let Some(call_id) = call_id else { + return Ok(()); + }; + if *kind != ConversationItemKind::CustomToolCallOutput { + return Ok(()); + } + // The output item can be observed after the CodeCell was created, e.g. + // when a later inference request carries the custom-tool result back to + // the model. Add the reverse ProducerRef at that later observation + // point instead of copying runtime bytes into the conversation model. 
+ let code_cell_id = self.reduced_code_cell_id_for_model_visible_call(call_id); + if !self.rollout.code_cells.contains_key(&code_cell_id) { + return Ok(()); + } + self.add_code_cell_output_item(&code_cell_id, item_id) + } + + /// Resolves the owning thread for a code-cell runtime event. + /// + /// Runtime events should carry a thread id, but older/raw paths may only have + /// the turn id. The fallback keeps replay strict while avoiding duplicate logic + /// in every code-cell event arm. + pub(super) fn code_cell_event_thread_id( + &self, + thread_id: Option, + codex_turn_id: Option<&str>, + runtime_cell_id: &str, + event_name: &str, + ) -> Result { + if let Some(thread_id) = thread_id { + return Ok(thread_id); + } + let Some(codex_turn_id) = codex_turn_id else { + bail!("{event_name} {runtime_cell_id} did not include a thread id"); + }; + self.rollout + .codex_turns + .get(codex_turn_id) + .map(|turn| turn.thread_id.clone()) + .with_context(|| { + format!( + "{event_name} {runtime_cell_id} referenced unknown Codex turn {codex_turn_id}" + ) + }) + } + + /// Derives the stable reduced code-cell id from the model-visible exec call id. + pub(super) fn reduced_code_cell_id_for_model_visible_call( + &self, + model_visible_call_id: &str, + ) -> CodeCellId { + // The model-visible `exec` call is the durable source identity. The + // runtime `cell_id` is only a thread-local handle used for later waits + // and nested tool calls. + format!("code_cell:{model_visible_call_id}") + } + + /// Records the thread-local runtime cell id to reduced code-cell id mapping. + /// + /// Runtime ids can repeat across threads, so callers must provide the owning + /// thread id when creating or resolving this bridge. 
+ pub(super) fn record_runtime_code_cell_id( + &mut self, + thread_id: &str, + runtime_cell_id: &str, + code_cell_id: &str, + ) -> Result<()> { + let key = runtime_code_cell_key(thread_id, runtime_cell_id); + if let Some(existing) = self.code_cell_ids_by_runtime.get(&key) { + if existing == code_cell_id { + return Ok(()); + } + bail!( + "runtime code cell {runtime_cell_id} in thread {thread_id} mapped to both \ + {existing} and {code_cell_id}" + ); + } + self.code_cell_ids_by_runtime + .insert(key, code_cell_id.to_string()); + Ok(()) + } + + /// Resolves a runtime cell id to the reduced code-cell id for the given thread. + pub(super) fn code_cell_id_for_runtime_cell_id( + &self, + thread_id: &str, + runtime_cell_id: &str, + event_name: &str, + ) -> Result { + self.code_cell_id_for_runtime_cell_id_if_known(thread_id, runtime_cell_id) + .with_context(|| { + format!( + "{event_name} referenced unknown runtime cell {runtime_cell_id} \ + in thread {thread_id}" + ) + }) + } + + fn code_cell_id_for_runtime_cell_id_if_known( + &self, + thread_id: &str, + runtime_cell_id: &str, + ) -> Option { + self.code_cell_ids_by_runtime + .get(&runtime_code_cell_key(thread_id, runtime_cell_id)) + .cloned() + } + + /// Converts a raw tool requester into the reduced graph requester. + /// + /// Code-mode tool requests arrive with a runtime cell id, so this method is + /// the boundary that turns that runtime handle into a stable code-cell anchor. 
+ pub(super) fn reduce_tool_call_requester( + &self, + thread_id: &str, + requester: RawToolCallRequester, + ) -> Result { + match requester { + RawToolCallRequester::Model => Ok(ToolCallRequester::Model), + RawToolCallRequester::CodeCell { runtime_cell_id } => Ok(ToolCallRequester::CodeCell { + code_cell_id: self.code_cell_id_for_runtime_cell_id( + thread_id, + &runtime_cell_id, + "code-mode nested tool", + )?, + }), + } + } + + fn validate_code_cell_turn(&self, thread_id: &str, codex_turn_id: &str) -> Result<()> { + if !self.rollout.threads.contains_key(thread_id) { + bail!("code cell start referenced unknown thread {thread_id}"); + } + let Some(turn) = self.rollout.codex_turns.get(codex_turn_id) else { + bail!("code cell start referenced unknown Codex turn {codex_turn_id}"); + }; + if turn.thread_id != thread_id { + bail!( + "code cell start used thread {thread_id}, but Codex turn {codex_turn_id} belongs \ + to {}", + turn.thread_id + ); + } + Ok(()) + } + + fn model_visible_code_cell_item_ids( + &self, + thread_id: &str, + call_id: &str, + kind: ConversationItemKind, + ) -> Vec { + self.rollout + .conversation_items + .values() + .filter(|item| { + item.thread_id == thread_id + && item.call_id.as_deref() == Some(call_id) + && item.kind == kind + }) + .map(|item| item.item_id.clone()) + .collect() + } + + fn source_item_id_for_code_cell_start( + &self, + thread_id: &str, + code_cell_id: &str, + model_visible_call_id: &str, + ) -> Result { + self.model_visible_code_cell_item_ids( + thread_id, + model_visible_call_id, + ConversationItemKind::CustomToolCall, + ) + .into_iter() + .next() + .with_context(|| { + format!( + "code cell {code_cell_id} referenced model-visible call {model_visible_call_id}, \ + but no custom tool call item was observed" + ) + }) + } + + fn add_code_cell_output_item(&mut self, code_cell_id: &str, item_id: &str) -> Result<()> { + let Some(cell) = self.rollout.code_cells.get_mut(code_cell_id) else { + bail!("code cell {code_cell_id} 
disappeared during output linking"); + }; + push_unique(&mut cell.output_item_ids, item_id); + + let Some(item) = self.rollout.conversation_items.get_mut(item_id) else { + bail!("conversation item {item_id} disappeared during code-cell output linking"); + }; + let producer = ProducerRef::CodeCell { + code_cell_id: code_cell_id.to_string(), + }; + if !item.produced_by.contains(&producer) { + item.produced_by.push(producer); + } + Ok(()) + } +} + +fn execution_status_for_code_cell(status: &CodeCellRuntimeStatus) -> ExecutionStatus { + match status { + CodeCellRuntimeStatus::Starting + | CodeCellRuntimeStatus::Running + | CodeCellRuntimeStatus::Yielded => ExecutionStatus::Running, + CodeCellRuntimeStatus::Completed => ExecutionStatus::Completed, + CodeCellRuntimeStatus::Failed => ExecutionStatus::Failed, + CodeCellRuntimeStatus::Terminated => ExecutionStatus::Cancelled, + } +} + +fn push_unique(items: &mut Vec, item_id: &str) { + if !items.iter().any(|existing| existing == item_id) { + items.push(item_id.to_string()); + } +} + +fn runtime_code_cell_key(thread_id: &str, runtime_cell_id: &str) -> (String, String) { + (thread_id.to_string(), runtime_cell_id.to_string()) +} + +#[cfg(test)] +#[path = "code_cell_tests.rs"] +mod tests; diff --git a/codex-rs/rollout-trace/src/reducer/code_cell_tests.rs b/codex-rs/rollout-trace/src/reducer/code_cell_tests.rs new file mode 100644 index 000000000000..f77921352e77 --- /dev/null +++ b/codex-rs/rollout-trace/src/reducer/code_cell_tests.rs @@ -0,0 +1,423 @@ +use pretty_assertions::assert_eq; +use serde_json::json; +use tempfile::TempDir; + +use crate::model::CodeCellRuntimeStatus; +use crate::model::ConversationItemKind; +use crate::model::ExecutionStatus; +use crate::model::ProducerRef; +use crate::model::ToolCallKind; +use crate::model::ToolCallSummary; +use crate::payload::RawPayloadKind; +use crate::raw_event::RawToolCallRequester; +use crate::raw_event::RawTraceEventPayload; +use 
crate::reducer::test_support::create_started_writer; +use crate::reducer::test_support::message; +use crate::reducer::test_support::start_turn; +use crate::reducer::test_support::start_turn_for_thread; +use crate::reducer::test_support::trace_context; +use crate::reducer::test_support::trace_context_for_thread; +use crate::replay_bundle; + +#[test] +fn code_cell_lifecycle_links_nested_tools_waits_and_outputs() -> anyhow::Result<()> { + let temp = TempDir::new()?; + let writer = create_started_writer(&temp)?; + start_turn(&writer, "turn-1")?; + + let request = writer.write_json_payload( + RawPayloadKind::InferenceRequest, + &json!({ + "input": [message("user", "count files")] + }), + )?; + writer.append(RawTraceEventPayload::InferenceStarted { + inference_call_id: "inference-1".to_string(), + thread_id: "thread-root".to_string(), + codex_turn_id: "turn-1".to_string(), + model: "gpt-test".to_string(), + provider_name: "test-provider".to_string(), + request_payload: request, + })?; + let response = writer.write_json_payload( + RawPayloadKind::InferenceResponse, + &json!({ + "response_id": "resp-1", + "output_items": [{ + "type": "custom_tool_call", + "name": "exec", + "call_id": "call-code", + "input": "text('hi')" + }] + }), + )?; + // Runtime tool dispatch starts before the stream-completion hook has + // reduced the model response that requested `exec`. 
+ writer.append_with_context( + trace_context("turn-1"), + RawTraceEventPayload::CodeCellStarted { + runtime_cell_id: "1".to_string(), + model_visible_call_id: "call-code".to_string(), + source_js: "text('hi')".to_string(), + }, + )?; + writer.append(RawTraceEventPayload::InferenceCompleted { + inference_call_id: "inference-1".to_string(), + response_id: Some("resp-1".to_string()), + response_payload: response, + })?; + writer.append_with_context( + trace_context("turn-1"), + RawTraceEventPayload::CodeCellInitialResponse { + runtime_cell_id: "1".to_string(), + status: CodeCellRuntimeStatus::Yielded, + response_payload: None, + }, + )?; + writer.append_with_context( + trace_context("turn-1"), + RawTraceEventPayload::ToolCallStarted { + tool_call_id: "nested-tool-1".to_string(), + model_visible_call_id: None, + code_mode_runtime_tool_id: Some("tool-1".to_string()), + requester: RawToolCallRequester::CodeCell { + runtime_cell_id: "1".to_string(), + }, + kind: ToolCallKind::ExecCommand, + summary: ToolCallSummary::Generic { + label: "exec_command".to_string(), + input_preview: Some("pwd".to_string()), + output_preview: None, + }, + invocation_payload: None, + }, + )?; + writer.append_with_context( + trace_context("turn-1"), + RawTraceEventPayload::ToolCallEnded { + tool_call_id: "nested-tool-1".to_string(), + status: ExecutionStatus::Completed, + result_payload: None, + }, + )?; + + start_turn(&writer, "turn-2")?; + let followup = writer.write_json_payload( + RawPayloadKind::InferenceRequest, + &json!({ + "previous_response_id": "resp-1", + "input": [{ + "type": "custom_tool_call_output", + "call_id": "call-code", + "output": "Script running with cell ID 1" + }] + }), + )?; + writer.append(RawTraceEventPayload::InferenceStarted { + inference_call_id: "inference-2".to_string(), + thread_id: "thread-root".to_string(), + codex_turn_id: "turn-2".to_string(), + model: "gpt-test".to_string(), + provider_name: "test-provider".to_string(), + request_payload: followup, + })?; + 
let wait_request = writer.write_json_payload( + RawPayloadKind::ToolInvocation, + &json!({ + "tool_name": "wait", + "tool_namespace": null, + "payload": { + "type": "function", + "arguments": "{\"cell_id\":\"1\"}" + } + }), + )?; + writer.append_with_context( + trace_context("turn-2"), + RawTraceEventPayload::ToolCallStarted { + tool_call_id: "wait-tool-1".to_string(), + model_visible_call_id: Some("wait-call".to_string()), + code_mode_runtime_tool_id: None, + requester: RawToolCallRequester::Model, + kind: ToolCallKind::Other { + name: "wait".to_string(), + }, + summary: ToolCallSummary::Generic { + label: "wait".to_string(), + input_preview: Some("{\"cell_id\":\"1\"}".to_string()), + output_preview: None, + }, + invocation_payload: Some(wait_request), + }, + )?; + writer.append_with_context( + trace_context("turn-2"), + RawTraceEventPayload::CodeCellEnded { + runtime_cell_id: "1".to_string(), + status: CodeCellRuntimeStatus::Completed, + response_payload: None, + }, + )?; + + let rollout = replay_bundle(temp.path())?; + let code_cell_id = test_reduced_code_cell_id("call-code"); + let cell = &rollout.code_cells[&code_cell_id]; + let output_item_id = rollout.inference_calls["inference-2"] + .request_item_ids + .last() + .expect("exec output item"); + + assert_eq!(cell.thread_id, "thread-root"); + assert_eq!(cell.runtime_status, CodeCellRuntimeStatus::Completed); + assert_eq!(cell.execution.status, ExecutionStatus::Completed); + assert_eq!(cell.runtime_cell_id, Some("1".to_string())); + assert_eq!(cell.nested_tool_call_ids, vec!["nested-tool-1"]); + assert_eq!(cell.wait_tool_call_ids, vec!["wait-tool-1"]); + assert_eq!(cell.output_item_ids, vec![output_item_id.clone()]); + assert_eq!( + rollout.conversation_items[output_item_id].produced_by, + vec![ProducerRef::CodeCell { + code_cell_id: code_cell_id.clone(), + }] + ); + assert_eq!( + rollout.conversation_items[&cell.source_item_id].kind, + ConversationItemKind::CustomToolCall, + ); + + Ok(()) +} + +#[test] +fn 
fast_code_cell_lifecycle_waits_for_source_item() -> anyhow::Result<()> { + let temp = TempDir::new()?; + let writer = create_started_writer(&temp)?; + start_turn(&writer, "turn-1")?; + + let request = writer.write_json_payload( + RawPayloadKind::InferenceRequest, + &json!({ + "input": [message("user", "count files")] + }), + )?; + writer.append(RawTraceEventPayload::InferenceStarted { + inference_call_id: "inference-1".to_string(), + thread_id: "thread-root".to_string(), + codex_turn_id: "turn-1".to_string(), + model: "gpt-test".to_string(), + provider_name: "test-provider".to_string(), + request_payload: request, + })?; + writer.append_with_context( + trace_context("turn-1"), + RawTraceEventPayload::CodeCellStarted { + runtime_cell_id: "1".to_string(), + model_visible_call_id: "call-code".to_string(), + source_js: "not valid js".to_string(), + }, + )?; + writer.append_with_context( + trace_context("turn-1"), + RawTraceEventPayload::CodeCellInitialResponse { + runtime_cell_id: "1".to_string(), + status: CodeCellRuntimeStatus::Failed, + response_payload: None, + }, + )?; + writer.append_with_context( + trace_context("turn-1"), + RawTraceEventPayload::CodeCellEnded { + runtime_cell_id: "1".to_string(), + status: CodeCellRuntimeStatus::Failed, + response_payload: None, + }, + )?; + let response = writer.write_json_payload( + RawPayloadKind::InferenceResponse, + &json!({ + "response_id": "resp-1", + "output_items": [{ + "type": "custom_tool_call", + "name": "exec", + "call_id": "call-code", + "input": "not valid js" + }] + }), + )?; + writer.append(RawTraceEventPayload::InferenceCompleted { + inference_call_id: "inference-1".to_string(), + response_id: Some("resp-1".to_string()), + response_payload: response, + })?; + + let rollout = replay_bundle(temp.path())?; + let code_cell_id = test_reduced_code_cell_id("call-code"); + let cell = &rollout.code_cells[&code_cell_id]; + + assert_eq!(cell.thread_id, "thread-root"); + assert_eq!(cell.runtime_status, 
CodeCellRuntimeStatus::Failed); + assert_eq!(cell.execution.status, ExecutionStatus::Failed); + assert_eq!(cell.runtime_cell_id, Some("1".to_string())); + assert_eq!( + rollout.conversation_items[&cell.source_item_id].kind, + ConversationItemKind::CustomToolCall, + ); + + Ok(()) +} + +#[test] +fn cancelled_turn_terminates_unfinished_code_cell() -> anyhow::Result<()> { + let temp = TempDir::new()?; + let writer = create_started_writer(&temp)?; + start_turn(&writer, "turn-1")?; + + let request = writer.write_json_payload( + RawPayloadKind::InferenceRequest, + &json!({ + "input": [message("user", "count files")] + }), + )?; + writer.append(RawTraceEventPayload::InferenceStarted { + inference_call_id: "inference-1".to_string(), + thread_id: "thread-root".to_string(), + codex_turn_id: "turn-1".to_string(), + model: "gpt-test".to_string(), + provider_name: "test-provider".to_string(), + request_payload: request, + })?; + let response = writer.write_json_payload( + RawPayloadKind::InferenceResponse, + &json!({ + "response_id": "resp-1", + "output_items": [{ + "type": "custom_tool_call", + "name": "exec", + "call_id": "call-code", + "input": "await tools.exec_command({cmd: 'slow'});" + }] + }), + )?; + writer.append(RawTraceEventPayload::InferenceCompleted { + inference_call_id: "inference-1".to_string(), + response_id: Some("resp-1".to_string()), + response_payload: response, + })?; + writer.append_with_context( + trace_context("turn-1"), + RawTraceEventPayload::CodeCellStarted { + runtime_cell_id: "1".to_string(), + model_visible_call_id: "call-code".to_string(), + source_js: "await tools.exec_command({cmd: 'slow'});".to_string(), + }, + )?; + let turn_end = writer.append_with_context( + trace_context("turn-1"), + RawTraceEventPayload::CodexTurnEnded { + codex_turn_id: "turn-1".to_string(), + status: ExecutionStatus::Cancelled, + }, + )?; + + let rollout = replay_bundle(temp.path())?; + let code_cell_id = test_reduced_code_cell_id("call-code"); + let cell = 
&rollout.code_cells[&code_cell_id]; + + assert_eq!(cell.runtime_status, CodeCellRuntimeStatus::Terminated); + assert_eq!(cell.execution.status, ExecutionStatus::Cancelled); + assert_eq!(cell.execution.ended_seq, Some(turn_end.seq)); + + Ok(()) +} + +#[test] +fn runtime_code_cell_ids_can_repeat_across_threads() -> anyhow::Result<()> { + let temp = TempDir::new()?; + let writer = create_started_writer(&temp)?; + writer.append(RawTraceEventPayload::ThreadStarted { + thread_id: "thread-child".to_string(), + agent_path: "/root/child".to_string(), + metadata_payload: None, + })?; + start_turn_for_thread(&writer, "thread-root", "turn-root")?; + start_turn_for_thread(&writer, "thread-child", "turn-child")?; + + for (thread_id, turn_id, inference_call_id, call_id) in [ + ("thread-root", "turn-root", "inference-root", "call-root"), + ( + "thread-child", + "turn-child", + "inference-child", + "call-child", + ), + ] { + let request = writer.write_json_payload( + RawPayloadKind::InferenceRequest, + &json!({ + "input": [message("user", "run code")] + }), + )?; + writer.append(RawTraceEventPayload::InferenceStarted { + inference_call_id: inference_call_id.to_string(), + thread_id: thread_id.to_string(), + codex_turn_id: turn_id.to_string(), + model: "gpt-test".to_string(), + provider_name: "test-provider".to_string(), + request_payload: request, + })?; + writer.append_with_context( + trace_context_for_thread(thread_id, turn_id), + RawTraceEventPayload::CodeCellStarted { + runtime_cell_id: "1".to_string(), + model_visible_call_id: call_id.to_string(), + source_js: "text('hi')".to_string(), + }, + )?; + let response = writer.write_json_payload( + RawPayloadKind::InferenceResponse, + &json!({ + "response_id": format!("resp-{thread_id}"), + "output_items": [{ + "type": "custom_tool_call", + "name": "exec", + "call_id": call_id, + "input": "text('hi')" + }] + }), + )?; + writer.append(RawTraceEventPayload::InferenceCompleted { + inference_call_id: inference_call_id.to_string(), + 
response_id: Some(format!("resp-{thread_id}")), + response_payload: response, + })?; + writer.append_with_context( + trace_context_for_thread(thread_id, turn_id), + RawTraceEventPayload::CodeCellEnded { + runtime_cell_id: "1".to_string(), + status: CodeCellRuntimeStatus::Completed, + response_payload: None, + }, + )?; + } + + let rollout = replay_bundle(temp.path())?; + let root_cell_id = test_reduced_code_cell_id("call-root"); + let child_cell_id = test_reduced_code_cell_id("call-child"); + + assert_eq!(rollout.code_cells[&root_cell_id].thread_id, "thread-root"); + assert_eq!(rollout.code_cells[&child_cell_id].thread_id, "thread-child"); + assert_eq!( + rollout.code_cells[&root_cell_id].runtime_cell_id, + Some("1".to_string()) + ); + assert_eq!( + rollout.code_cells[&child_cell_id].runtime_cell_id, + Some("1".to_string()) + ); + + Ok(()) +} + +fn test_reduced_code_cell_id(model_visible_call_id: &str) -> String { + format!("code_cell:{model_visible_call_id}") +} diff --git a/codex-rs/rollout-trace/src/reducer/compaction.rs b/codex-rs/rollout-trace/src/reducer/compaction.rs new file mode 100644 index 000000000000..f29b5dfa8282 --- /dev/null +++ b/codex-rs/rollout-trace/src/reducer/compaction.rs @@ -0,0 +1,183 @@ +//! Reducer support for the remote compaction lifecycle. +//! +//! This module owns request/checkpoint bookkeeping. Conversation item reconciliation stays in +//! `conversation` because it depends on the same normalization and reuse invariants as inference +//! requests. + +use anyhow::Result; +use anyhow::bail; + +use super::TraceReducer; +use crate::model::Compaction; +use crate::model::CompactionRequest; +use crate::model::CompactionRequestId; +use crate::model::ExecutionStatus; +use crate::model::ExecutionWindow; +use crate::payload::RawPayloadRef; +use crate::raw_event::RawEventSeq; + +impl TraceReducer { + /// Starts one upstream request attempt for a compaction operation. 
+ pub(super) fn start_compaction_request( + &mut self, + seq: RawEventSeq, + wall_time_unix_ms: i64, + started: StartedCompactionRequest, + ) -> Result<()> { + if self + .rollout + .compaction_requests + .contains_key(&started.compaction_request_id) + { + bail!( + "duplicate compaction request start for {}", + started.compaction_request_id + ); + } + self.thread_mut(&started.thread_id)?; + let Some(turn) = self.rollout.codex_turns.get(&started.codex_turn_id) else { + bail!( + "compaction request {} referenced unknown codex turn {}", + started.compaction_request_id, + started.codex_turn_id + ); + }; + if turn.thread_id != started.thread_id { + bail!( + "compaction request {} used thread {}, but codex turn {} belongs to {}", + started.compaction_request_id, + started.thread_id, + started.codex_turn_id, + turn.thread_id + ); + } + + self.rollout.compaction_requests.insert( + started.compaction_request_id.clone(), + CompactionRequest { + compaction_request_id: started.compaction_request_id, + compaction_id: started.compaction_id, + thread_id: started.thread_id, + codex_turn_id: started.codex_turn_id, + execution: ExecutionWindow { + started_at_unix_ms: wall_time_unix_ms, + started_seq: seq, + ended_at_unix_ms: None, + ended_seq: None, + status: ExecutionStatus::Running, + }, + model: started.model, + provider_name: started.provider_name, + raw_request_payload_id: started.request_payload.raw_payload_id, + raw_response_payload_id: None, + }, + ); + Ok(()) + } + + /// Completes an upstream compaction request attempt without modifying conversation history. + /// + /// The request/response payloads are evidence for the remote call. The live + /// conversation changes only when a separate install event provides the checkpoint. 
+ pub(super) fn complete_compaction_request( + &mut self, + seq: RawEventSeq, + wall_time_unix_ms: i64, + compaction_id: String, + compaction_request_id: CompactionRequestId, + status: ExecutionStatus, + response_payload: Option, + ) -> Result<()> { + let Some(request) = self + .rollout + .compaction_requests + .get_mut(&compaction_request_id) + else { + bail!( + "compaction request completion referenced unknown request {compaction_request_id}" + ); + }; + if request.compaction_id != compaction_id { + bail!( + "compaction request {compaction_request_id} completion used compaction {compaction_id}, but start used {}", + request.compaction_id + ); + } + request.execution.ended_at_unix_ms = Some(wall_time_unix_ms); + request.execution.ended_seq = Some(seq); + request.execution.status = status; + request.raw_response_payload_id = response_payload.map(|payload| payload.raw_payload_id); + Ok(()) + } + + /// Installs a compaction checkpoint into the reduced conversation graph. + /// + /// This is the semantic boundary where replacement history becomes the live + /// thread history; request attempts alone do not imply that change. 
+ pub(super) fn reduce_compaction_installed_event( + &mut self, + wall_time_unix_ms: i64, + thread_id: String, + codex_turn_id: String, + compaction_id: String, + checkpoint_payload: RawPayloadRef, + ) -> Result<()> { + if self.rollout.compactions.contains_key(&compaction_id) { + bail!("duplicate compaction install for {compaction_id}"); + } + self.thread_mut(&thread_id)?; + let Some(turn) = self.rollout.codex_turns.get(&codex_turn_id) else { + bail!( + "compaction install {compaction_id} referenced unknown codex turn {codex_turn_id}" + ); + }; + if turn.thread_id != thread_id { + bail!( + "compaction install {compaction_id} used thread {thread_id}, but codex turn {codex_turn_id} belongs to {}", + turn.thread_id + ); + } + let checkpoint = self.reduce_compaction_checkpoint( + wall_time_unix_ms, + &thread_id, + codex_turn_id.as_str(), + &compaction_id, + &checkpoint_payload, + )?; + let request_ids = self + .rollout + .compaction_requests + .values() + .filter(|request| request.compaction_id == compaction_id) + .map(|request| request.compaction_request_id.clone()) + .collect(); + + self.pending_compaction_replacement_item_ids + .insert(thread_id.clone(), checkpoint.replacement_item_ids.clone()); + self.rollout.compactions.insert( + compaction_id.clone(), + Compaction { + compaction_id, + thread_id, + codex_turn_id, + installed_at_unix_ms: wall_time_unix_ms, + marker_item_id: checkpoint.marker_item_id, + request_ids, + input_item_ids: checkpoint.input_item_ids, + replacement_item_ids: checkpoint.replacement_item_ids, + }, + ); + Ok(()) + } +} + +/// Raw compaction-request start fields after dispatch has stripped the event envelope. 
+pub(super) struct StartedCompactionRequest { + pub(super) compaction_id: String, + pub(super) compaction_request_id: String, + pub(super) thread_id: String, + pub(super) codex_turn_id: String, + pub(super) model: String, + pub(super) provider_name: String, + pub(super) request_payload: RawPayloadRef, +} diff --git a/codex-rs/rollout-trace/src/reducer/conversation.rs b/codex-rs/rollout-trace/src/reducer/conversation.rs new file mode 100644 index 000000000000..53ae60824da2 --- /dev/null +++ b/codex-rs/rollout-trace/src/reducer/conversation.rs @@ -0,0 +1,700 @@ +//! Conversation reduction from model-facing payload snapshots. +//! +//! Inference request inputs and response outputs are both part of the logical +//! conversation because they are the payloads exchanged with the model. Runtime +//! observations, such as local tool output, stay outside the transcript until a +//! later model-facing payload carries their content. + +use anyhow::Context; +use anyhow::Result; +use anyhow::bail; +use serde_json::Value; + +use self::normalize::NormalizedConversationItem; +use super::TraceReducer; +use crate::model::CompactionId; +use crate::model::ConversationBody; +use crate::model::ConversationItem; +use crate::model::ConversationItemKind; +use crate::model::ConversationPart; +use crate::model::ConversationRole; +use crate::model::InferenceCallId; +use crate::model::ProducerRef; +use crate::payload::RawPayloadRef; + +mod normalize; + +impl TraceReducer { + /// Reduces an inference request input snapshot into model-visible conversation items. + /// + /// Request snapshots are reconciled by position against the previous model-visible + /// snapshot for the thread so repeated history reuses ids while newly inserted + /// items remain distinct. 
+ pub(super) fn reduce_inference_request( + &mut self, + wall_time_unix_ms: i64, + inference_call_id: &InferenceCallId, + thread_id: &str, + codex_turn_id: &str, + request_payload: &RawPayloadRef, + ) -> Result> { + let payload = self.read_payload_json(request_payload)?; + let Some(input) = payload.get("input") else { + bail!( + "inference request payload {} did not contain input", + request_payload.raw_payload_id + ); + }; + let Some(request_items) = input.as_array() else { + bail!( + "inference request payload {} had non-array input", + request_payload.raw_payload_id + ); + }; + + let items = normalize::normalize_model_items(request_items, request_payload)?; + + let previous_response_id = payload.get("previous_response_id").and_then(Value::as_str); + // After compaction, the next full request is compared against the installed replacement + // history, not the pre-compaction prompt. Any repeated developer/context prefix that Codex + // reinjects must therefore become a fresh post-compaction conversation item. + let post_compaction_snapshot = if previous_response_id.is_none() { + self.pending_compaction_replacement_item_ids + .get(thread_id) + .cloned() + } else { + None + }; + let request_item_ids = if let Some(previous_response_id) = previous_response_id { + // Streaming follow-up requests can send only the new input plus a + // `previous_response_id`. The trace model still exposes the full + // model-visible input, so rebuild the omitted prefix from the + // previous request and response before reducing this delta. 
+ let previous_items = self + .rollout + .inference_calls + .values() + .find(|inference| { + inference.thread_id == thread_id + && inference.upstream_request_id.as_deref() == Some(previous_response_id) + }) + .map(|inference| { + let mut ids = inference.request_item_ids.clone(); + ids.extend(inference.response_item_ids.clone()); + ids + }); + let Some(mut item_ids) = previous_items else { + bail!( + "incremental inference request {inference_call_id} referenced unknown previous_response_id {previous_response_id}" + ); + }; + let delta_item_ids = self.reconcile_conversation_items( + items, + ReconcileItems { + thread_id, + codex_turn_id, + wall_time_unix_ms, + produced_by: Vec::new(), + start_index: item_ids.len(), + mode: ReconcileMode::AppendOnly, + snapshot_override: None, + }, + )?; + item_ids.extend(delta_item_ids); + item_ids + } else { + self.reconcile_conversation_items( + items, + ReconcileItems { + thread_id, + codex_turn_id, + wall_time_unix_ms, + produced_by: Vec::new(), + start_index: 0, + mode: ReconcileMode::FullSnapshot, + snapshot_override: post_compaction_snapshot.as_deref(), + }, + )? + }; + + self.append_thread_conversation_items(thread_id, &request_item_ids)?; + if post_compaction_snapshot.is_some() { + self.pending_compaction_replacement_item_ids + .remove(thread_id); + } + self.thread_conversation_snapshots + .insert(thread_id.to_string(), request_item_ids.clone()); + Ok(request_item_ids) + } + + /// Reduces an inference response payload into conversation items produced by the call. 
+ pub(super) fn reduce_inference_response( + &mut self, + wall_time_unix_ms: i64, + inference_call_id: &InferenceCallId, + response_payload: &RawPayloadRef, + ) -> Result> { + let payload = self.read_payload_json(response_payload)?; + let Some(output_items) = payload.get("output_items").and_then(Value::as_array) else { + bail!( + "inference response payload {} did not contain output_items", + response_payload.raw_payload_id + ); + }; + + let Some((thread_id, codex_turn_id)) = self + .rollout + .inference_calls + .get(inference_call_id) + .map(|inference| (inference.thread_id.clone(), inference.codex_turn_id.clone())) + else { + bail!("inference response referenced unknown call {inference_call_id}"); + }; + + let items = normalize::normalize_model_items(output_items, response_payload)?; + // Response output is appended immediately: it was produced by the model, + // so it is conversation even before a later request carries it forward. + let append_at = self + .thread_conversation_snapshots + .get(&thread_id) + .map_or(0, Vec::len); + let response_item_ids = self.reconcile_conversation_items( + items, + ReconcileItems { + thread_id: &thread_id, + codex_turn_id: &codex_turn_id, + wall_time_unix_ms, + produced_by: vec![ProducerRef::Inference { + inference_call_id: inference_call_id.clone(), + }], + start_index: append_at, + mode: ReconcileMode::AppendOnly, + snapshot_override: None, + }, + )?; + self.append_thread_conversation_items(&thread_id, &response_item_ids)?; + self.thread_conversation_snapshots + .entry(thread_id) + .or_default() + .extend(response_item_ids.clone()); + + if let Some(usage) = payload + .get("token_usage") + .and_then(normalize::token_usage_from_value) + && let Some(inference) = self.rollout.inference_calls.get_mut(inference_call_id) + { + inference.usage = Some(usage); + } + + Ok(response_item_ids) + } + + fn reconcile_conversation_items( + &mut self, + items: Vec, + context: ReconcileItems<'_>, + ) -> Result> { + let previous_snapshot = 
context.snapshot_override.map_or_else( + || { + self.thread_conversation_snapshots + .get(context.thread_id) + .cloned() + .unwrap_or_default() + }, + <[_]>::to_vec, + ); + let mut item_ids = Vec::with_capacity(items.len()); + + for (offset, item) in items.into_iter().enumerate() { + let index = context.start_index + offset; + let tool_link_item = item.clone(); + self.ensure_call_id_consistency(context.thread_id, &item)?; + self.ensure_reasoning_consistency(context.thread_id, &item)?; + let item_id = if let Some(previous_item_id) = previous_snapshot.get(index) { + if self.item_matches(previous_item_id, &item) { + previous_item_id.clone() + } else if matches!(context.mode, ReconcileMode::FullSnapshot) { + self.find_matching_snapshot_item(&previous_snapshot, &item_ids, &item) + .unwrap_or_else(|| { + self.create_conversation_item( + context.thread_id, + Some(context.codex_turn_id.to_string()), + context.wall_time_unix_ms, + item, + context.produced_by.clone(), + ) + }) + } else { + let codex_turn_id = context.codex_turn_id; + let thread_id = context.thread_id; + bail!( + "model conversation mismatch while reducing turn {codex_turn_id} for \ + thread {thread_id} at item index {index}: existing item \ + {previous_item_id} does not match the current model payload item" + ); + } + } else if matches!(context.mode, ReconcileMode::FullSnapshot) { + self.find_matching_snapshot_item(&previous_snapshot, &item_ids, &item) + .unwrap_or_else(|| { + self.create_conversation_item( + context.thread_id, + Some(context.codex_turn_id.to_string()), + context.wall_time_unix_ms, + item, + context.produced_by.clone(), + ) + }) + } else { + self.create_conversation_item( + context.thread_id, + Some(context.codex_turn_id.to_string()), + context.wall_time_unix_ms, + item, + context.produced_by.clone(), + ) + }; + self.update_conversation_item_from_sighting( + &item_id, + &tool_link_item, + &context.produced_by, + )?; + self.attach_model_visible_tool_item( + &item_id, + 
tool_link_item.call_id.as_deref(), + &tool_link_item.kind, + )?; + self.attach_model_visible_code_cell_item( + &item_id, + tool_link_item.call_id.as_deref(), + &tool_link_item.kind, + )?; + self.resolve_pending_agent_edges_for_item(&item_id)?; + item_ids.push(item_id); + } + + self.flush_pending_code_cell_starts()?; + Ok(item_ids) + } + + /// Reduces a compaction checkpoint payload into installed replacement history. + /// + /// The returned ids let the compaction reducer record both the boundary marker + /// and the snapshot that future full requests should reconcile against. + pub(super) fn reduce_compaction_checkpoint( + &mut self, + wall_time_unix_ms: i64, + thread_id: &str, + codex_turn_id: &str, + compaction_id: &CompactionId, + checkpoint_payload: &RawPayloadRef, + ) -> Result { + let payload = self.read_payload_json(checkpoint_payload)?; + let input_history = required_array(&payload, "input_history", checkpoint_payload)?; + let replacement_history = + required_array(&payload, "replacement_history", checkpoint_payload)?; + + let input_items = normalize::normalize_model_items(input_history, checkpoint_payload)?; + let replacement_items = + normalize::normalize_model_items(replacement_history, checkpoint_payload)?; + let input_candidates = self + .thread_conversation_snapshots + .get(thread_id) + .cloned() + .unwrap_or_default(); + let input_item_ids = self.reconcile_detached_conversation_items( + input_items, + DetachedReconcileItems { + thread_id, + codex_turn_id, + wall_time_unix_ms, + produced_by: Vec::new(), + candidates: input_candidates, + }, + )?; + // A compaction checkpoint has two transcript effects. First, record the structural + // boundary where old live history ended. Then append the replacement items, including + // the provider-visible summary item if the compact endpoint returned one. 
+ let marker_item_id = self.create_conversation_item( + thread_id, + Some(codex_turn_id.to_string()), + wall_time_unix_ms, + NormalizedConversationItem { + role: ConversationRole::Assistant, + channel: None, + kind: ConversationItemKind::CompactionMarker, + // The summary is a separate model/provider-visible item. Keep the marker body + // empty so transcript renderers cannot mistake the boundary for prompt content. + body: ConversationBody { parts: Vec::new() }, + call_id: None, + }, + vec![ProducerRef::Compaction { + compaction_id: compaction_id.clone(), + }], + ); + let replacement_item_ids = self.reconcile_detached_conversation_items( + replacement_items, + DetachedReconcileItems { + thread_id, + codex_turn_id, + wall_time_unix_ms, + produced_by: vec![ProducerRef::Compaction { + compaction_id: compaction_id.clone(), + }], + // Replacement history is a rewrite boundary. Even if the compact endpoint emits + // text that matches old history, the installed item is a new post-compaction + // conversation item and should not reuse a pre-compaction ID. 
+ candidates: Vec::new(), + }, + )?; + self.append_thread_conversation_items(thread_id, &input_item_ids)?; + self.append_thread_conversation_items(thread_id, std::slice::from_ref(&marker_item_id))?; + self.append_thread_conversation_items(thread_id, &replacement_item_ids)?; + Ok(ReducedCompactionCheckpoint { + input_item_ids, + marker_item_id, + replacement_item_ids, + }) + } + + fn reconcile_detached_conversation_items( + &mut self, + items: Vec, + context: DetachedReconcileItems<'_>, + ) -> Result> { + let mut item_ids = Vec::with_capacity(items.len()); + + for item in items { + let tool_link_item = item.clone(); + self.ensure_call_id_consistency(context.thread_id, &item)?; + self.ensure_reasoning_consistency(context.thread_id, &item)?; + let item_id = self + .find_matching_snapshot_item(&context.candidates, &item_ids, &item) + .unwrap_or_else(|| { + self.create_conversation_item( + context.thread_id, + Some(context.codex_turn_id.to_string()), + context.wall_time_unix_ms, + item, + context.produced_by.clone(), + ) + }); + self.update_conversation_item_from_sighting( + &item_id, + &tool_link_item, + &context.produced_by, + )?; + self.attach_model_visible_tool_item( + &item_id, + tool_link_item.call_id.as_deref(), + &tool_link_item.kind, + )?; + self.attach_model_visible_code_cell_item( + &item_id, + tool_link_item.call_id.as_deref(), + &tool_link_item.kind, + )?; + self.resolve_pending_agent_edges_for_item(&item_id)?; + item_ids.push(item_id); + } + + self.flush_pending_code_cell_starts()?; + Ok(item_ids) + } + + fn create_conversation_item( + &mut self, + thread_id: &str, + codex_turn_id: Option, + first_seen_at_unix_ms: i64, + item: NormalizedConversationItem, + produced_by: Vec, + ) -> String { + let item_id = self.next_conversation_item_id(); + self.rollout.conversation_items.insert( + item_id.clone(), + ConversationItem { + item_id: item_id.clone(), + thread_id: thread_id.to_string(), + codex_turn_id, + first_seen_at_unix_ms, + role: item.role, + channel: 
item.channel, + kind: item.kind, + body: item.body, + call_id: item.call_id, + produced_by, + }, + ); + item_id + } + + fn update_conversation_item_from_sighting( + &mut self, + item_id: &str, + normalized: &NormalizedConversationItem, + produced_by: &[ProducerRef], + ) -> Result<()> { + let Some(item) = self.rollout.conversation_items.get_mut(item_id) else { + bail!("conversation item {item_id} was referenced before it was created"); + }; + + if item.kind == ConversationItemKind::Reasoning { + merge_reasoning_body(&mut item.body, &normalized.body)?; + } + for producer in produced_by { + if !item.produced_by.contains(producer) { + item.produced_by.push(producer.clone()); + } + } + Ok(()) + } + + fn append_thread_conversation_items( + &mut self, + thread_id: &str, + item_ids: &[String], + ) -> Result<()> { + let thread = self.thread_mut(thread_id)?; + for item_id in item_ids { + if !thread.conversation_item_ids.contains(item_id) { + thread.conversation_item_ids.push(item_id.clone()); + } + } + Ok(()) + } + + fn find_matching_snapshot_item( + &self, + previous_snapshot: &[String], + used_item_ids: &[String], + normalized: &NormalizedConversationItem, + ) -> Option { + previous_snapshot + .iter() + .find(|item_id| { + !used_item_ids.contains(item_id) && self.item_matches(item_id, normalized) + }) + .cloned() + } + + fn ensure_call_id_consistency( + &self, + thread_id: &str, + normalized: &NormalizedConversationItem, + ) -> Result<()> { + let Some(call_id) = normalized.call_id.as_deref() else { + return Ok(()); + }; + for item in self.rollout.conversation_items.values() { + if item.thread_id == thread_id + && item.call_id.as_deref() == Some(call_id) + && item.kind == normalized.kind + && !conversation_item_matches(item, normalized) + { + bail!("model-visible call id {call_id} was reused with different content"); + } + } + Ok(()) + } + + fn ensure_reasoning_consistency( + &self, + thread_id: &str, + normalized: &NormalizedConversationItem, + ) -> Result<()> { + if 
normalized.kind != ConversationItemKind::Reasoning { + return Ok(()); + }; + let Some((label, value)) = reasoning_encoded_part(&normalized.body) else { + return Ok(()); + }; + + for item in self.rollout.conversation_items.values() { + if item.thread_id == thread_id + && item.kind == ConversationItemKind::Reasoning + && item.channel == normalized.channel + && reasoning_encoded_part(&item.body) == Some((label, value)) + && !reasoning_body_matches(&item.body, &normalized.body) + { + bail!("reasoning encrypted_content was reused with different readable content"); + } + } + Ok(()) + } + + fn item_matches(&self, item_id: &str, normalized: &NormalizedConversationItem) -> bool { + let Some(item) = self.rollout.conversation_items.get(item_id) else { + return false; + }; + conversation_item_matches(item, normalized) + } + + fn next_conversation_item_id(&mut self) -> String { + let ordinal = self.next_conversation_item_ordinal; + self.next_conversation_item_ordinal += 1; + format!("conversation_item:{ordinal}") + } +} + +#[derive(Clone, Copy)] +enum ReconcileMode { + /// Full model requests are authoritative snapshots of the live context. The + /// prompt builder can reorder already-observed items or replace history + /// with synthetic summary messages, so item identity is "same content, + /// reused at most once in this snapshot" rather than "same position only". + FullSnapshot, + /// Incremental request deltas and response outputs append to a known prefix. + /// A mismatch at an occupied position means our reconstructed prefix is + /// wrong and should fail replay. 
+ AppendOnly, +} + +struct ReconcileItems<'a> { + thread_id: &'a str, + codex_turn_id: &'a str, + wall_time_unix_ms: i64, + produced_by: Vec, + start_index: usize, + mode: ReconcileMode, + snapshot_override: Option<&'a [String]>, +} + +struct DetachedReconcileItems<'a> { + thread_id: &'a str, + codex_turn_id: &'a str, + wall_time_unix_ms: i64, + produced_by: Vec, + candidates: Vec, +} + +/// Conversation ids produced when a compaction checkpoint is installed. +/// +/// The marker item records the boundary, while replacement items are the live +/// history that subsequent full requests should treat as their baseline. +pub(super) struct ReducedCompactionCheckpoint { + pub(super) input_item_ids: Vec, + pub(super) marker_item_id: String, + pub(super) replacement_item_ids: Vec, +} + +fn required_array<'a>( + payload: &'a Value, + key: &str, + raw_payload: &RawPayloadRef, +) -> Result<&'a Vec> { + payload.get(key).and_then(Value::as_array).with_context(|| { + format!( + "compaction checkpoint payload {} did not contain array {key}", + raw_payload.raw_payload_id + ) + }) +} + +fn conversation_item_matches( + item: &ConversationItem, + normalized: &NormalizedConversationItem, +) -> bool { + let body_matches = if item.kind == ConversationItemKind::Reasoning + && normalized.kind == ConversationItemKind::Reasoning + { + reasoning_body_matches(&item.body, &normalized.body) + } else { + conversation_body_matches(&item.body, &normalized.body) + }; + + item.role == normalized.role + && item.channel == normalized.channel + && item.kind == normalized.kind + && body_matches + && item.call_id == normalized.call_id +} + +fn conversation_body_matches(left: &ConversationBody, right: &ConversationBody) -> bool { + left.parts.len() == right.parts.len() + && left + .parts + .iter() + .zip(&right.parts) + .all(|(left, right)| match (left, right) { + ( + ConversationPart::Json { + summary: left_summary, + raw_payload_id: _, + }, + ConversationPart::Json { + summary: right_summary, + 
raw_payload_id: _, + }, + ) => left_summary == right_summary, + _ => left == right, + }) +} + +fn reasoning_body_matches(left: &ConversationBody, right: &ConversationBody) -> bool { + if conversation_body_matches(left, right) { + return true; + } + + // The Responses API may return readable reasoning on completion, but later + // request snapshots often replay only the encrypted blob. The blob is the + // stable model-visible identity; readable text/summary is extra evidence + // that must agree whenever both sides provide it. + let Some(left_encoded) = reasoning_encoded_part(left) else { + return false; + }; + let Some(right_encoded) = reasoning_encoded_part(right) else { + return false; + }; + + left_encoded == right_encoded && readable_reasoning_parts_match(left, right) +} + +fn merge_reasoning_body( + existing: &mut ConversationBody, + incoming: &ConversationBody, +) -> Result<()> { + if conversation_body_matches(existing, incoming) { + return Ok(()); + } + if !reasoning_body_matches(existing, incoming) { + bail!("reasoning encrypted_content was reused with different readable content"); + } + if readable_reasoning_parts(existing).is_empty() + && !readable_reasoning_parts(incoming).is_empty() + { + existing.parts = incoming.parts.clone(); + } + Ok(()) +} + +fn reasoning_encoded_part(body: &ConversationBody) -> Option<(&str, &str)> { + body.parts.iter().find_map(|part| { + if let ConversationPart::Encoded { label, value } = part { + Some((label.as_str(), value.as_str())) + } else { + None + } + }) +} + +fn readable_reasoning_parts_match(left: &ConversationBody, right: &ConversationBody) -> bool { + let left = readable_reasoning_parts(left); + let right = readable_reasoning_parts(right); + left.is_empty() || right.is_empty() || left == right +} + +fn readable_reasoning_parts(body: &ConversationBody) -> Vec<&ConversationPart> { + body.parts + .iter() + .filter(|part| { + matches!( + part, + ConversationPart::Text { .. } | ConversationPart::Summary { .. 
} + ) + }) + .collect() +} + +#[cfg(test)] +#[path = "conversation_tests.rs"] +mod tests; diff --git a/codex-rs/rollout-trace/src/reducer/conversation/normalize.rs b/codex-rs/rollout-trace/src/reducer/conversation/normalize.rs new file mode 100644 index 000000000000..0d4ba18dc5c4 --- /dev/null +++ b/codex-rs/rollout-trace/src/reducer/conversation/normalize.rs @@ -0,0 +1,446 @@ +//! Normalization from Responses-shaped JSON items into conversation item data. + +use anyhow::Result; +use anyhow::bail; +use serde_json::Value; + +use crate::model::ConversationBody; +use crate::model::ConversationChannel; +use crate::model::ConversationItemKind; +use crate::model::ConversationPart; +use crate::model::ConversationRole; +use crate::model::TokenUsage; +use crate::payload::RawPayloadRef; + +/// Conversation fields parsed from one Responses item before trace identity. +/// +/// IDs and provenance are assigned after positional reconciliation. Keeping the +/// normalized data separate from `ConversationItem` makes reuse vs insertion a +/// single reducer decision instead of something the parser has to know about. 
+#[derive(Clone)] +pub(super) struct NormalizedConversationItem { + pub(super) role: ConversationRole, + pub(super) channel: Option, + pub(super) kind: ConversationItemKind, + pub(super) body: ConversationBody, + pub(super) call_id: Option, +} + +pub(super) fn normalize_model_items( + items: &[Value], + raw_payload: &RawPayloadRef, +) -> Result> { + let mut normalized_items = Vec::new(); + for item in items { + normalized_items.push(normalize_model_item(item, raw_payload)?); + } + Ok(normalized_items) +} + +pub(super) fn token_usage_from_value(value: &Value) -> Option { + Some(TokenUsage { + input_tokens: u64_field(value, "input_tokens")?, + cached_input_tokens: u64_field(value, "cached_input_tokens")?, + output_tokens: u64_field(value, "output_tokens")?, + reasoning_output_tokens: u64_field(value, "reasoning_output_tokens")?, + }) +} + +fn normalize_model_item( + item: &Value, + raw_payload: &RawPayloadRef, +) -> Result { + let Some(item_type) = item.get("type").and_then(Value::as_str) else { + bail!( + "model item in payload {} did not contain a string type", + raw_payload.raw_payload_id + ); + }; + match item_type { + "message" => normalize_message_item(item, raw_payload), + "reasoning" => normalize_reasoning_item(item, raw_payload), + "function_call" => Ok(NormalizedConversationItem { + role: ConversationRole::Assistant, + channel: Some(ConversationChannel::Commentary), + kind: ConversationItemKind::FunctionCall, + body: raw_text_or_json_body(item.get("arguments"), raw_payload), + call_id: item + .get("call_id") + .and_then(Value::as_str) + .map(ToString::to_string), + }), + "function_call_output" => Ok(NormalizedConversationItem { + role: ConversationRole::Tool, + channel: Some(ConversationChannel::Commentary), + kind: ConversationItemKind::FunctionCallOutput, + body: tool_output_body(item.get("output"), raw_payload), + call_id: item + .get("call_id") + .and_then(Value::as_str) + .map(ToString::to_string), + }), + "custom_tool_call" => 
Ok(NormalizedConversationItem { + role: ConversationRole::Assistant, + channel: Some(ConversationChannel::Commentary), + kind: ConversationItemKind::CustomToolCall, + body: custom_tool_call_body(item, raw_payload), + call_id: item + .get("call_id") + .and_then(Value::as_str) + .map(ToString::to_string), + }), + "custom_tool_call_output" => Ok(NormalizedConversationItem { + role: ConversationRole::Tool, + channel: Some(ConversationChannel::Commentary), + kind: ConversationItemKind::CustomToolCallOutput, + body: tool_output_body(item.get("output"), raw_payload), + call_id: item + .get("call_id") + .and_then(Value::as_str) + .map(ToString::to_string), + }), + "tool_search_call" | "web_search_call" | "image_generation_call" | "local_shell_call" => { + Ok(NormalizedConversationItem { + role: ConversationRole::Assistant, + channel: Some(ConversationChannel::Commentary), + kind: ConversationItemKind::FunctionCall, + body: json_body(item, raw_payload), + call_id: item + .get("call_id") + .and_then(Value::as_str) + .map(ToString::to_string), + }) + } + "tool_search_output" | "mcp_tool_call_output" => Ok(NormalizedConversationItem { + role: ConversationRole::Tool, + channel: Some(ConversationChannel::Commentary), + kind: ConversationItemKind::FunctionCallOutput, + body: json_body(item, raw_payload), + call_id: item + .get("call_id") + .and_then(Value::as_str) + .map(ToString::to_string), + }), + "compaction" | "compaction_summary" => Ok(NormalizedConversationItem { + role: ConversationRole::Assistant, + channel: Some(ConversationChannel::Summary), + kind: ConversationItemKind::Message, + body: compaction_body(item, raw_payload)?, + call_id: None, + }), + _ => bail!( + "unsupported model item type {item_type} in payload {}", + raw_payload.raw_payload_id + ), + } +} + +fn normalize_message_item( + item: &Value, + raw_payload: &RawPayloadRef, +) -> Result { + let Some(role) = item.get("role").and_then(Value::as_str) else { + bail!( + "message item in payload {} did not contain 
a string role", + raw_payload.raw_payload_id + ); + }; + let Some(role) = role_from_str(role) else { + bail!( + "unsupported message role {role} in payload {}", + raw_payload.raw_payload_id + ); + }; + Ok(NormalizedConversationItem { + role, + channel: item + .get("phase") + .and_then(Value::as_str) + .and_then(channel_from_phase), + kind: ConversationItemKind::Message, + body: ConversationBody { + parts: content_parts(item.get("content"), raw_payload), + }, + call_id: None, + }) +} + +fn normalize_reasoning_item( + item: &Value, + raw_payload: &RawPayloadRef, +) -> Result { + let mut parts = Vec::new(); + append_reasoning_parts( + item, + "content", + ReasoningPartKind::Content, + raw_payload, + &mut parts, + )?; + append_reasoning_parts( + item, + "summary", + ReasoningPartKind::Summary, + raw_payload, + &mut parts, + )?; + + if let Some(encrypted_content) = item.get("encrypted_content") { + let encrypted_content = match encrypted_content { + Value::Null => None, + Value::String(encrypted_content) => Some(encrypted_content), + _ => { + bail!( + "reasoning item in payload {} had non-string encrypted_content", + raw_payload.raw_payload_id + ); + } + }; + if let Some(encrypted_content) = encrypted_content { + parts.push(ConversationPart::Encoded { + label: "encrypted_content".to_string(), + value: encrypted_content.to_string(), + }); + } + } + + if parts.is_empty() { + bail!( + "reasoning item in payload {} contained no content, summary, or encrypted_content", + raw_payload.raw_payload_id + ); + } + + Ok(NormalizedConversationItem { + role: ConversationRole::Assistant, + channel: Some(ConversationChannel::Analysis), + kind: ConversationItemKind::Reasoning, + body: ConversationBody { parts }, + call_id: None, + }) +} + +#[derive(Clone, Copy)] +enum ReasoningPartKind { + Content, + Summary, +} + +fn append_reasoning_parts( + item: &Value, + key: &str, + kind: ReasoningPartKind, + raw_payload: &RawPayloadRef, + parts: &mut Vec, +) -> Result<()> { + let Some(items) = 
item.get(key) else { + return Ok(()); + }; + if matches!((kind, items), (ReasoningPartKind::Content, Value::Null)) { + return Ok(()); + } + let Some(items) = items.as_array() else { + bail!( + "reasoning item in payload {} had non-array {key}", + raw_payload.raw_payload_id + ); + }; + + for content_item in items { + let Some(item_type) = content_item.get("type").and_then(Value::as_str) else { + bail!( + "reasoning item in payload {} had {key} entry without string type", + raw_payload.raw_payload_id + ); + }; + let expected_type = match kind { + ReasoningPartKind::Content => { + if !matches!(item_type, "reasoning_text" | "text") { + bail!( + "reasoning item in payload {} had unsupported content type {item_type}", + raw_payload.raw_payload_id + ); + } + "content" + } + ReasoningPartKind::Summary => { + if item_type != "summary_text" { + bail!( + "reasoning item in payload {} had unsupported summary type {item_type}", + raw_payload.raw_payload_id + ); + } + "summary" + } + }; + + let Some(text) = content_item.get("text").and_then(Value::as_str) else { + bail!( + "reasoning item in payload {} had {expected_type} entry without string text", + raw_payload.raw_payload_id + ); + }; + match kind { + ReasoningPartKind::Content => parts.push(ConversationPart::Text { + text: text.to_string(), + }), + ReasoningPartKind::Summary => parts.push(ConversationPart::Summary { + text: text.to_string(), + }), + } + } + + Ok(()) +} + +fn role_from_str(role: &str) -> Option { + match role { + "system" => Some(ConversationRole::System), + "developer" => Some(ConversationRole::Developer), + "user" => Some(ConversationRole::User), + "assistant" => Some(ConversationRole::Assistant), + "tool" => Some(ConversationRole::Tool), + _ => None, + } +} + +fn channel_from_phase(phase: &str) -> Option { + match phase { + "commentary" => Some(ConversationChannel::Commentary), + "final_answer" => Some(ConversationChannel::Final), + "summary" => Some(ConversationChannel::Summary), + _ => None, + } +} + +fn 
content_parts(content: Option<&Value>, raw_payload: &RawPayloadRef) -> Vec { + let Some(content) = content.and_then(Value::as_array) else { + return vec![payload_ref_part("content", raw_payload)]; + }; + + let mut parts = Vec::new(); + for part in content { + match part.get("type").and_then(Value::as_str) { + Some("input_text" | "output_text" | "text") => { + if let Some(text) = part.get("text").and_then(Value::as_str) { + parts.push(ConversationPart::Text { + text: text.to_string(), + }); + } + } + Some("input_image") => parts.push(payload_ref_part("input_image", raw_payload)), + Some(other) => parts.push(payload_ref_part(other, raw_payload)), + None => parts.push(payload_ref_part("content", raw_payload)), + } + } + + if parts.is_empty() { + parts.push(payload_ref_part("empty_content", raw_payload)); + } + parts +} + +fn custom_tool_call_body(item: &Value, raw_payload: &RawPayloadRef) -> ConversationBody { + let Some(input) = item.get("input").and_then(Value::as_str) else { + return json_body(item, raw_payload); + }; + if item.get("name").and_then(Value::as_str) == Some("exec") { + ConversationBody { + parts: vec![ConversationPart::Code { + language: "javascript".to_string(), + source: input.to_string(), + }], + } + } else { + ConversationBody { + parts: vec![ConversationPart::Text { + text: input.to_string(), + }], + } + } +} + +fn raw_text_or_json_body(value: Option<&Value>, raw_payload: &RawPayloadRef) -> ConversationBody { + match value { + Some(Value::String(text)) => { + if let Ok(json) = serde_json::from_str::(text) { + json_body(&json, raw_payload) + } else { + ConversationBody { + parts: vec![ConversationPart::Text { text: text.clone() }], + } + } + } + Some(value) => json_body(value, raw_payload), + None => ConversationBody { + parts: vec![payload_ref_part("payload", raw_payload)], + }, + } +} + +fn tool_output_body(output: Option<&Value>, raw_payload: &RawPayloadRef) -> ConversationBody { + match output { + Some(Value::String(text)) => ConversationBody 
{ + parts: vec![ConversationPart::Text { text: text.clone() }], + }, + Some(Value::Array(_)) => ConversationBody { + parts: content_parts(output, raw_payload), + }, + Some(value) => json_body(value, raw_payload), + None => ConversationBody { + parts: vec![payload_ref_part("tool_output", raw_payload)], + }, + } +} + +fn compaction_body(item: &Value, raw_payload: &RawPayloadRef) -> Result { + let Some(encrypted_content) = item.get("encrypted_content").and_then(Value::as_str) else { + bail!( + "compaction item in payload {} did not contain string encrypted_content", + raw_payload.raw_payload_id + ); + }; + // `type: "compaction"` is the remote-compaction summary that later re-enters model requests. + // The structural "history was cut here" marker is inserted separately when the checkpoint is + // installed; payload refs are observation-local, so the encoded summary itself is identity. + Ok(ConversationBody { + parts: vec![ConversationPart::Encoded { + label: "encrypted_content".to_string(), + value: encrypted_content.to_string(), + }], + }) +} + +fn json_body(value: &Value, raw_payload: &RawPayloadRef) -> ConversationBody { + ConversationBody { + parts: vec![ConversationPart::Json { + summary: summarize_json(value), + raw_payload_id: raw_payload.raw_payload_id.clone(), + }], + } +} + +fn payload_ref_part(label: &str, raw_payload: &RawPayloadRef) -> ConversationPart { + ConversationPart::PayloadRef { + label: label.to_string(), + raw_payload_id: raw_payload.raw_payload_id.clone(), + } +} + +fn summarize_json(value: &Value) -> String { + const MAX_JSON_SUMMARY_LEN: usize = 240; + let mut summary = + serde_json::to_string(value).unwrap_or_else(|_| "".to_string()); + if summary.len() > MAX_JSON_SUMMARY_LEN { + summary.truncate(MAX_JSON_SUMMARY_LEN); + summary.push_str("..."); + } + summary +} + +fn u64_field(value: &Value, field: &str) -> Option { + value + .get(field) + .and_then(Value::as_i64) + .map(|value| value.max(0) as u64) +} diff --git 
a/codex-rs/rollout-trace/src/reducer/conversation_tests.rs b/codex-rs/rollout-trace/src/reducer/conversation_tests.rs new file mode 100644 index 000000000000..aa8aee5ffb32 --- /dev/null +++ b/codex-rs/rollout-trace/src/reducer/conversation_tests.rs @@ -0,0 +1,808 @@ +use pretty_assertions::assert_eq; +use serde_json::json; +use tempfile::TempDir; + +use crate::model::ConversationChannel; +use crate::model::ConversationItemKind; +use crate::model::ConversationPart; +use crate::model::ExecutionStatus; +use crate::model::ProducerRef; +use crate::model::ToolCallKind; +use crate::model::ToolCallSummary; +use crate::payload::RawPayloadKind; +use crate::raw_event::RawTraceEventPayload; +use crate::reducer::test_support::append_inference_completion; +use crate::reducer::test_support::append_inference_start; +use crate::reducer::test_support::create_started_writer; +use crate::reducer::test_support::expect_replay_error; +use crate::reducer::test_support::message; +use crate::reducer::test_support::start_turn; +use crate::reducer::test_support::trace_context; +use crate::replay_bundle; + +#[test] +fn request_snapshots_reuse_history_without_deduping_new_identical_items() -> anyhow::Result<()> { + let temp = TempDir::new()?; + let writer = create_started_writer(&temp)?; + start_turn(&writer, "turn-1")?; + + let first_request = writer.write_json_payload( + RawPayloadKind::InferenceRequest, + &json!({ + "input": [message("user", "ok")] + }), + )?; + append_inference_start(&writer, "inference-1", "turn-1", first_request)?; + start_turn(&writer, "turn-2")?; + + let second_request = writer.write_json_payload( + RawPayloadKind::InferenceRequest, + &json!({ + "input": [ + message("user", "ok"), + message("assistant", "ack"), + message("user", "ok") + ] + }), + )?; + append_inference_start(&writer, "inference-2", "turn-2", second_request)?; + + let rollout = replay_bundle(temp.path())?; + let first = &rollout.inference_calls["inference-1"].request_item_ids; + let second = 
&rollout.inference_calls["inference-2"].request_item_ids; + + assert_eq!(first.len(), 1); + assert_eq!(second.len(), 3); + assert_eq!(second[0], first[0]); + assert_ne!(second[2], first[0]); + assert_eq!(rollout.conversation_items.len(), 3); + assert_eq!( + rollout.threads["thread-root"].conversation_item_ids, + *second + ); + + Ok(()) +} + +#[test] +fn response_outputs_enter_thread_conversation_on_completion() -> anyhow::Result<()> { + let temp = TempDir::new()?; + let writer = create_started_writer(&temp)?; + start_turn(&writer, "turn-1")?; + + let request = writer.write_json_payload( + RawPayloadKind::InferenceRequest, + &json!({ + "input": [message("user", "run tests")] + }), + )?; + append_inference_start(&writer, "inference-1", "turn-1", request)?; + + let response = writer.write_json_payload( + RawPayloadKind::InferenceResponse, + &json!({ + "response_id": "resp-1", + "output_items": [ + { + "type": "message", + "role": "assistant", + "content": [{"type": "output_text", "text": "tests passed"}] + } + ] + }), + )?; + append_inference_completion(&writer, "inference-1", "resp-1", response)?; + + let rollout = replay_bundle(temp.path())?; + let inference = &rollout.inference_calls["inference-1"]; + let mut expected_thread_items = inference.request_item_ids.clone(); + expected_thread_items.extend(inference.response_item_ids.clone()); + + assert_eq!(inference.response_item_ids.len(), 1); + assert_eq!( + rollout.threads["thread-root"].conversation_item_ids, + expected_thread_items, + ); + + Ok(()) +} + +#[test] +fn later_full_request_reuses_prior_json_tool_call_by_position() -> anyhow::Result<()> { + let temp = TempDir::new()?; + let writer = create_started_writer(&temp)?; + start_turn(&writer, "turn-1")?; + + let request = writer.write_json_payload( + RawPayloadKind::InferenceRequest, + &json!({ + "input": [message("user", "run tests")] + }), + )?; + append_inference_start(&writer, "inference-1", "turn-1", request)?; + + let response = writer.write_json_payload( + 
RawPayloadKind::InferenceResponse, + &json!({ + "response_id": "resp-1", + "output_items": [{ + "type": "function_call", + "name": "shell", + "arguments": "{\"cmd\":\"cargo test\"}", + "call_id": "call-1" + }] + }), + )?; + append_inference_completion(&writer, "inference-1", "resp-1", response)?; + start_turn(&writer, "turn-2")?; + + let next_request = writer.write_json_payload( + RawPayloadKind::InferenceRequest, + &json!({ + "input": [ + message("user", "run tests"), + { + "type": "function_call", + "name": "shell", + "arguments": "{\"cmd\":\"cargo test\"}", + "call_id": "call-1" + } + ] + }), + )?; + append_inference_start(&writer, "inference-2", "turn-2", next_request)?; + + let rollout = replay_bundle(temp.path())?; + let first = &rollout.inference_calls["inference-1"]; + let second = &rollout.inference_calls["inference-2"]; + + assert_eq!( + second.request_item_ids, + vec![ + first.request_item_ids[0].clone(), + first.response_item_ids[0].clone(), + ], + ); + assert_eq!(rollout.conversation_items.len(), 2); + + Ok(()) +} + +#[test] +fn incremental_request_carries_prior_request_and_response_items_forward() -> anyhow::Result<()> { + let temp = TempDir::new()?; + let writer = create_started_writer(&temp)?; + start_turn(&writer, "turn-1")?; + + let request = writer.write_json_payload( + RawPayloadKind::InferenceRequest, + &json!({ + "input": [message("user", "run tests")] + }), + )?; + append_inference_start(&writer, "inference-1", "turn-1", request)?; + + let response = writer.write_json_payload( + RawPayloadKind::InferenceResponse, + &json!({ + "response_id": "resp-1", + "token_usage": { + "input_tokens": 10, + "cached_input_tokens": 1, + "output_tokens": 5, + "reasoning_output_tokens": 2, + "total_tokens": 15 + }, + "output_items": [ + { + "type": "function_call", + "name": "shell", + "arguments": "{\"cmd\":\"cargo test\"}", + "call_id": "call-1" + } + ] + }), + )?; + append_inference_completion(&writer, "inference-1", "resp-1", response)?; + 
start_turn(&writer, "turn-2")?; + + let incremental_request = writer.write_json_payload( + RawPayloadKind::InferenceRequest, + &json!({ + "type": "response.create", + "previous_response_id": "resp-1", + "input": [ + { + "type": "function_call_output", + "call_id": "call-1", + "output": "tests passed" + } + ] + }), + )?; + append_inference_start(&writer, "inference-2", "turn-2", incremental_request)?; + + let rollout = replay_bundle(temp.path())?; + let first = &rollout.inference_calls["inference-1"]; + let second = &rollout.inference_calls["inference-2"]; + + assert_eq!(first.response_item_ids.len(), 1); + assert_eq!( + second.request_item_ids, + vec![ + first.request_item_ids[0].clone(), + first.response_item_ids[0].clone(), + rollout.threads["thread-root"].conversation_item_ids[2].clone(), + ], + ); + assert_eq!( + rollout.threads["thread-root"].conversation_item_ids, + second.request_item_ids, + ); + assert_eq!( + first.usage.as_ref().map(|usage| usage.input_tokens), + Some(10), + ); + + Ok(()) +} + +#[test] +fn full_request_snapshot_can_reorder_existing_items_and_insert_summary() -> anyhow::Result<()> { + let temp = TempDir::new()?; + let writer = create_started_writer(&temp)?; + start_turn(&writer, "turn-1")?; + + let request = writer.write_json_payload( + RawPayloadKind::InferenceRequest, + &json!({ + "input": [ + message("developer", "follow the repo rules"), + message("user", "count files") + ] + }), + )?; + append_inference_start(&writer, "inference-1", "turn-1", request)?; + start_turn(&writer, "turn-2")?; + + let compacted_request = writer.write_json_payload( + RawPayloadKind::InferenceRequest, + &json!({ + "input": [ + message("user", "count files"), + message("user", "summary from a compacted prior attempt"), + message("developer", "follow the repo rules") + ] + }), + )?; + append_inference_start(&writer, "inference-2", "turn-2", compacted_request)?; + + let rollout = replay_bundle(temp.path())?; + let first = 
&rollout.inference_calls["inference-1"].request_item_ids; + let second = &rollout.inference_calls["inference-2"].request_item_ids; + + assert_eq!(second[0], first[1]); + assert_eq!(second[2], first[0]); + assert_ne!(second[1], first[0]); + assert_ne!(second[1], first[1]); + assert_eq!(rollout.conversation_items.len(), 3); + + Ok(()) +} + +#[test] +fn reasoning_body_preserves_text_summary_and_encoded_content() -> anyhow::Result<()> { + let temp = TempDir::new()?; + let writer = create_started_writer(&temp)?; + start_turn(&writer, "turn-1")?; + + let request = writer.write_json_payload( + RawPayloadKind::InferenceRequest, + &json!({ + "input": [message("user", "think visibly")] + }), + )?; + append_inference_start(&writer, "inference-1", "turn-1", request)?; + + let response = writer.write_json_payload( + RawPayloadKind::InferenceResponse, + &json!({ + "response_id": "resp-1", + "output_items": [{ + "type": "reasoning", + "content": [{"type": "reasoning_text", "text": "raw reasoning"}], + "summary": [{"type": "summary_text", "text": "brief summary"}], + "encrypted_content": "encoded-reasoning" + }] + }), + )?; + append_inference_completion(&writer, "inference-1", "resp-1", response)?; + + let rollout = replay_bundle(temp.path())?; + let reasoning_item_id = &rollout.inference_calls["inference-1"].response_item_ids[0]; + + assert_eq!( + rollout.conversation_items[reasoning_item_id].body.parts, + vec![ + ConversationPart::Text { + text: "raw reasoning".to_string(), + }, + ConversationPart::Summary { + text: "brief summary".to_string(), + }, + ConversationPart::Encoded { + label: "encrypted_content".to_string(), + value: "encoded-reasoning".to_string(), + }, + ], + ); + + Ok(()) +} + +#[test] +fn encrypted_reasoning_reuses_response_item_in_later_request() -> anyhow::Result<()> { + let temp = TempDir::new()?; + let writer = create_started_writer(&temp)?; + start_turn(&writer, "turn-1")?; + + let user = message("user", "count files"); + let function_call = json!({ + 
"type": "function_call", + "name": "shell", + "arguments": "{\"cmd\":\"find . -maxdepth 1 -type f | wc -l\"}", + "call_id": "call-1" + }); + let encrypted_reasoning = json!({ + "type": "reasoning", + "summary": [], + "encrypted_content": "encoded-reasoning" + }); + let readable_reasoning = json!({ + "type": "reasoning", + "content": [{"type": "text", "text": "need count"}], + "summary": [], + "encrypted_content": "encoded-reasoning" + }); + + let request = writer.write_json_payload( + RawPayloadKind::InferenceRequest, + &json!({ + "input": [user] + }), + )?; + append_inference_start(&writer, "inference-1", "turn-1", request)?; + + let response = writer.write_json_payload( + RawPayloadKind::InferenceResponse, + &json!({ + "response_id": "resp-1", + "output_items": [ + readable_reasoning, + function_call + ] + }), + )?; + append_inference_completion(&writer, "inference-1", "resp-1", response)?; + start_turn(&writer, "turn-2")?; + + let followup = writer.write_json_payload( + RawPayloadKind::InferenceRequest, + &json!({ + "input": [ + user, + encrypted_reasoning, + function_call, + { + "type": "function_call_output", + "call_id": "call-1", + "output": "31\n" + } + ] + }), + )?; + append_inference_start(&writer, "inference-2", "turn-2", followup)?; + + let rollout = replay_bundle(temp.path())?; + let first = &rollout.inference_calls["inference-1"]; + let second = &rollout.inference_calls["inference-2"]; + let output_item_id = rollout.threads["thread-root"].conversation_item_ids[3].clone(); + + assert_eq!( + second.request_item_ids, + vec![ + first.request_item_ids[0].clone(), + first.response_item_ids[0].clone(), + first.response_item_ids[1].clone(), + output_item_id, + ], + ); + assert_eq!( + rollout.conversation_items[&first.response_item_ids[0]] + .body + .parts, + vec![ + ConversationPart::Text { + text: "need count".to_string(), + }, + ConversationPart::Encoded { + label: "encrypted_content".to_string(), + value: "encoded-reasoning".to_string(), + }, + ], + ); + 
assert_eq!(rollout.conversation_items.len(), 4); + assert_eq!( + rollout.threads["thread-root"].conversation_item_ids, + second.request_item_ids, + ); + + Ok(()) +} + +#[test] +fn same_encrypted_reasoning_with_different_text_is_reducer_error() -> anyhow::Result<()> { + let temp = TempDir::new()?; + let writer = create_started_writer(&temp)?; + start_turn(&writer, "turn-1")?; + + let user = message("user", "count files"); + let request = writer.write_json_payload( + RawPayloadKind::InferenceRequest, + &json!({ + "input": [user] + }), + )?; + append_inference_start(&writer, "inference-1", "turn-1", request)?; + + let response = writer.write_json_payload( + RawPayloadKind::InferenceResponse, + &json!({ + "response_id": "resp-1", + "output_items": [{ + "type": "reasoning", + "content": [{"type": "text", "text": "first text"}], + "summary": [], + "encrypted_content": "encoded-reasoning" + }] + }), + )?; + append_inference_completion(&writer, "inference-1", "resp-1", response)?; + start_turn(&writer, "turn-2")?; + + let conflicting_request = writer.write_json_payload( + RawPayloadKind::InferenceRequest, + &json!({ + "input": [ + user, + { + "type": "reasoning", + "content": [{"type": "text", "text": "different text"}], + "summary": [], + "encrypted_content": "encoded-reasoning" + } + ] + }), + )?; + append_inference_start(&writer, "inference-2", "turn-2", conflicting_request)?; + + expect_replay_error( + &temp, + "reasoning encrypted_content was reused with different readable content", + ) +} + +#[test] +fn model_visible_call_id_reuse_with_different_content_is_reducer_error() -> anyhow::Result<()> { + let temp = TempDir::new()?; + let writer = create_started_writer(&temp)?; + start_turn(&writer, "turn-1")?; + + let request = writer.write_json_payload( + RawPayloadKind::InferenceRequest, + &json!({ + "input": [{ + "type": "function_call", + "name": "shell", + "arguments": "{\"cmd\":\"cargo test\"}", + "call_id": "call-1" + }] + }), + )?; + append_inference_start(&writer, 
"inference-1", "turn-1", request)?; + start_turn(&writer, "turn-2")?; + + let conflicting_request = writer.write_json_payload( + RawPayloadKind::InferenceRequest, + &json!({ + "input": [{ + "type": "function_call", + "name": "shell", + "arguments": "{\"cmd\":\"cargo check\"}", + "call_id": "call-1" + }] + }), + )?; + append_inference_start(&writer, "inference-2", "turn-2", conflicting_request)?; + + expect_replay_error( + &temp, + "model-visible call id call-1 was reused with different content", + ) +} + +#[test] +fn unsupported_model_item_is_reducer_error() -> anyhow::Result<()> { + let temp = TempDir::new()?; + let writer = create_started_writer(&temp)?; + start_turn(&writer, "turn-1")?; + + let request = writer.write_json_payload( + RawPayloadKind::InferenceRequest, + &json!({ + "input": [ + { + "type": "new_unhandled_model_item", + "payload": "must not be silently skipped" + } + ] + }), + )?; + append_inference_start(&writer, "inference-1", "turn-1", request)?; + + expect_replay_error( + &temp, + "unsupported model item type new_unhandled_model_item", + ) +} + +#[test] +fn missing_request_input_is_reducer_error() -> anyhow::Result<()> { + let temp = TempDir::new()?; + let writer = create_started_writer(&temp)?; + start_turn(&writer, "turn-1")?; + + let request = writer.write_json_payload( + RawPayloadKind::InferenceRequest, + &json!({ + "model": "gpt-test" + }), + )?; + append_inference_start(&writer, "inference-1", "turn-1", request)?; + + expect_replay_error(&temp, "did not contain input") +} + +#[test] +fn unknown_previous_response_id_is_reducer_error() -> anyhow::Result<()> { + let temp = TempDir::new()?; + let writer = create_started_writer(&temp)?; + start_turn(&writer, "turn-1")?; + + let request = writer.write_json_payload( + RawPayloadKind::InferenceRequest, + &json!({ + "previous_response_id": "resp-missing", + "input": [message("user", "still here")] + }), + )?; + append_inference_start(&writer, "inference-1", "turn-1", request)?; + + 
expect_replay_error(&temp, "unknown previous_response_id resp-missing") +} + +#[test] +fn compaction_boundary_repeats_prefix_and_reuses_replacement_items() -> anyhow::Result<()> { + let temp = TempDir::new()?; + let writer = create_started_writer(&temp)?; + start_turn(&writer, "turn-1")?; + + let developer = message("developer", "follow repo rules"); + let user = message("user", "count files"); + let request = writer.write_json_payload( + RawPayloadKind::InferenceRequest, + &json!({ + "input": [developer, user] + }), + )?; + append_inference_start(&writer, "inference-1", "turn-1", request)?; + + let summary = message("user", "summary from compacted history"); + let compaction_summary = json!({ + "type": "compaction", + "encrypted_content": "encrypted-summary", + }); + let checkpoint = writer.write_json_payload( + RawPayloadKind::CompactionCheckpoint, + &json!({ + "input_history": [developer, user], + "replacement_history": [user, summary, compaction_summary] + }), + )?; + writer.append_with_context( + trace_context("turn-1"), + RawTraceEventPayload::CompactionInstalled { + compaction_id: "compaction-1".to_string(), + checkpoint_payload: checkpoint, + }, + )?; + + start_turn(&writer, "turn-2")?; + let post_compaction_request = writer.write_json_payload( + RawPayloadKind::InferenceRequest, + &json!({ + "input": [developer, user, summary, compaction_summary] + }), + )?; + append_inference_start(&writer, "inference-2", "turn-2", post_compaction_request)?; + + let rollout = replay_bundle(temp.path())?; + let first = &rollout.inference_calls["inference-1"]; + let second = &rollout.inference_calls["inference-2"]; + let compaction = &rollout.compactions["compaction-1"]; + + assert_eq!(compaction.input_item_ids, first.request_item_ids); + assert_eq!(second.request_item_ids.len(), 4); + assert_eq!( + &second.request_item_ids[1..], + compaction.replacement_item_ids.as_slice() + ); + let marker = &rollout.conversation_items[&compaction.marker_item_id]; + 
assert_eq!(marker.kind, ConversationItemKind::CompactionMarker); + assert_eq!(marker.body.parts, Vec::::new()); + assert_eq!( + marker.produced_by, + vec![ProducerRef::Compaction { + compaction_id: "compaction-1".to_string() + }], + ); + assert_ne!(second.request_item_ids[0], first.request_item_ids[0]); + assert_ne!( + compaction.replacement_item_ids[0], + first.request_item_ids[1] + ); + assert_eq!( + rollout.conversation_items[&compaction.replacement_item_ids[0]].produced_by, + vec![ProducerRef::Compaction { + compaction_id: "compaction-1".to_string() + }], + ); + assert_eq!( + rollout.conversation_items[&compaction.replacement_item_ids[1]].produced_by, + vec![ProducerRef::Compaction { + compaction_id: "compaction-1".to_string() + }], + ); + assert_eq!( + rollout.conversation_items[&compaction.replacement_item_ids[2]].channel, + Some(ConversationChannel::Summary), + ); + assert_eq!( + rollout.conversation_items[&compaction.replacement_item_ids[2]].kind, + ConversationItemKind::Message, + ); + assert_eq!( + rollout.conversation_items[&compaction.replacement_item_ids[2]] + .body + .parts, + vec![ConversationPart::Encoded { + label: "encrypted_content".to_string(), + value: "encrypted-summary".to_string(), + }], + ); + + Ok(()) +} + +#[test] +fn tool_call_links_model_call_and_followup_output_items() -> anyhow::Result<()> { + let temp = TempDir::new()?; + let writer = create_started_writer(&temp)?; + start_turn(&writer, "turn-1")?; + + let request = writer.write_json_payload( + RawPayloadKind::InferenceRequest, + &json!({ + "input": [message("user", "run tests")] + }), + )?; + append_inference_start(&writer, "inference-1", "turn-1", request)?; + + let response = writer.write_json_payload( + RawPayloadKind::InferenceResponse, + &json!({ + "response_id": "resp-1", + "output_items": [{ + "type": "function_call", + "name": "exec_command", + "arguments": "{\"cmd\":\"cargo test\"}", + "call_id": "call-1" + }] + }), + )?; + append_inference_completion(&writer, 
"inference-1", "resp-1", response)?; + writer.append_with_context( + trace_context("turn-1"), + RawTraceEventPayload::ToolCallStarted { + tool_call_id: "tool-1".to_string(), + model_visible_call_id: Some("call-1".to_string()), + code_mode_runtime_tool_id: None, + requester: crate::raw_event::RawToolCallRequester::Model, + kind: ToolCallKind::ExecCommand, + summary: ToolCallSummary::Generic { + label: "exec_command".to_string(), + input_preview: Some("cargo test".to_string()), + output_preview: None, + }, + invocation_payload: None, + }, + )?; + writer.append_with_context( + trace_context("turn-1"), + RawTraceEventPayload::ToolCallEnded { + tool_call_id: "tool-1".to_string(), + status: ExecutionStatus::Completed, + result_payload: None, + }, + )?; + + start_turn(&writer, "turn-2")?; + let followup = writer.write_json_payload( + RawPayloadKind::InferenceRequest, + &json!({ + "previous_response_id": "resp-1", + "input": [{ + "type": "function_call_output", + "call_id": "call-1", + "output": "tests passed" + }] + }), + )?; + append_inference_start(&writer, "inference-2", "turn-2", followup)?; + + let rollout = replay_bundle(temp.path())?; + let first_inference = &rollout.inference_calls["inference-1"]; + let second_inference = &rollout.inference_calls["inference-2"]; + let tool_call = &rollout.tool_calls["tool-1"]; + let output_item_id = second_inference + .request_item_ids + .last() + .expect("follow-up output item"); + + assert_eq!( + first_inference.tool_call_ids_started_by_response, + vec!["tool-1".to_string()], + ); + assert_eq!( + tool_call.model_visible_call_item_ids, + first_inference.response_item_ids, + ); + assert_eq!( + tool_call.model_visible_output_item_ids, + vec![output_item_id.clone()], + ); + assert_eq!( + rollout.conversation_items[output_item_id].produced_by, + vec![ProducerRef::Tool { + tool_call_id: "tool-1".to_string(), + }], + ); + + Ok(()) +} + +#[test] +fn inference_start_rejects_unknown_codex_turn() -> anyhow::Result<()> { + let temp = 
TempDir::new()?; + let writer = create_started_writer(&temp)?; + + let request = writer.write_json_payload( + RawPayloadKind::InferenceRequest, + &json!({ + "input": [message("user", "hello")] + }), + )?; + append_inference_start(&writer, "inference-1", "turn-missing", request)?; + + expect_replay_error(&temp, "referenced unknown codex turn turn-missing") +} diff --git a/codex-rs/rollout-trace/src/reducer/inference.rs b/codex-rs/rollout-trace/src/reducer/inference.rs new file mode 100644 index 000000000000..ddd08142ff7c --- /dev/null +++ b/codex-rs/rollout-trace/src/reducer/inference.rs @@ -0,0 +1,143 @@ +//! Inference call lifecycle reduction. +//! +//! Conversation request/response normalization lives in the conversation module; +//! this module owns the runtime envelope around those model-facing payloads. + +use anyhow::Result; +use anyhow::bail; + +use super::TraceReducer; +use crate::model::ExecutionStatus; +use crate::model::ExecutionWindow; +use crate::model::InferenceCall; +use crate::model::InferenceCallId; +use crate::payload::RawPayloadRef; +use crate::raw_event::RawEventSeq; + +/// Raw inference-start fields after dispatch has stripped the common event envelope. +/// +/// Keeping this as one argument prevents callsites from passing a long list of +/// adjacent strings whose ordering is easy to mix up. +pub(super) struct StartedInferenceCall { + pub(super) inference_call_id: InferenceCallId, + pub(super) thread_id: String, + pub(super) codex_turn_id: String, + pub(super) model: String, + pub(super) provider_name: String, + pub(super) request_payload: RawPayloadRef, +} + +impl TraceReducer { + /// Starts an inference call and reduces its request payload into conversation items. + /// + /// Requests are model-visible transcript evidence, so the inference object is only + /// inserted after the request snapshot has been normalized and linked to the turn. 
+ pub(super) fn start_inference_call( + &mut self, + seq: RawEventSeq, + wall_time_unix_ms: i64, + started: StartedInferenceCall, + ) -> Result<()> { + if self + .rollout + .inference_calls + .contains_key(&started.inference_call_id) + { + bail!( + "duplicate inference start for {}", + started.inference_call_id + ); + } + + let inference_call_id = started.inference_call_id.clone(); + let thread_id = started.thread_id.clone(); + let codex_turn_id = started.codex_turn_id.clone(); + let request_payload = started.request_payload.clone(); + let Some(turn) = self.rollout.codex_turns.get(&codex_turn_id) else { + bail!( + "inference start {inference_call_id} referenced unknown codex turn {codex_turn_id}" + ); + }; + if turn.thread_id != thread_id { + bail!( + "inference start {inference_call_id} used thread {thread_id}, \ + but codex turn {codex_turn_id} belongs to {}", + turn.thread_id + ); + } + + let request_item_ids = self.reduce_inference_request( + wall_time_unix_ms, + &inference_call_id, + &thread_id, + &codex_turn_id, + &request_payload, + )?; + + self.thread_mut(&thread_id)?; + + self.rollout.inference_calls.insert( + inference_call_id.clone(), + InferenceCall { + inference_call_id, + thread_id, + codex_turn_id, + execution: ExecutionWindow { + started_at_unix_ms: wall_time_unix_ms, + started_seq: seq, + ended_at_unix_ms: None, + ended_seq: None, + status: ExecutionStatus::Running, + }, + model: started.model, + provider_name: started.provider_name, + upstream_request_id: None, + request_item_ids, + response_item_ids: Vec::new(), + tool_call_ids_started_by_response: Vec::new(), + usage: None, + raw_request_payload_id: started.request_payload.raw_payload_id, + raw_response_payload_id: None, + }, + ); + Ok(()) + } + + /// Completes an inference call and, when present, reduces response output items. 
+    pub(super) fn complete_inference_call(
+        &mut self,
+        seq: RawEventSeq,
+        wall_time_unix_ms: i64,
+        inference_call_id: InferenceCallId,
+        status: ExecutionStatus,
+        response_id: Option<String>,
+        response_payload: Option<RawPayloadRef>,
+    ) -> Result<()> {
+        if !self
+            .rollout
+            .inference_calls
+            .contains_key(&inference_call_id)
+        {
+            bail!("inference completion referenced unknown call {inference_call_id}");
+        }
+
+        let response_item_ids = response_payload
+            .as_ref()
+            .map(|payload| {
+                self.reduce_inference_response(wall_time_unix_ms, &inference_call_id, payload)
+            })
+            .transpose()?;
+        let Some(inference) = self.rollout.inference_calls.get_mut(&inference_call_id) else {
+            bail!("inference call {inference_call_id} disappeared during response reduction");
+        };
+        inference.execution.ended_at_unix_ms = Some(wall_time_unix_ms);
+        inference.execution.ended_seq = Some(seq);
+        inference.execution.status = status;
+        inference.upstream_request_id = response_id;
+        inference.raw_response_payload_id = response_payload.map(|payload| payload.raw_payload_id);
+        if let Some(response_item_ids) = response_item_ids {
+            inference.response_item_ids = response_item_ids;
+        }
+        Ok(())
+    }
+}
diff --git a/codex-rs/rollout-trace/src/reducer/mod.rs b/codex-rs/rollout-trace/src/reducer/mod.rs
new file mode 100644
index 000000000000..e4f6b837f455
--- /dev/null
+++ b/codex-rs/rollout-trace/src/reducer/mod.rs
@@ -0,0 +1,504 @@
+//! Deterministic replay from raw trace events to `RolloutTrace`.
+
+use std::collections::BTreeMap;
+use std::fs::File;
+use std::io::BufRead;
+use std::io::BufReader;
+use std::path::Path;
+use std::path::PathBuf;
+
+use anyhow::Context;
+use anyhow::Result;
+use anyhow::bail;
+use serde_json::Value;
+
+use crate::bundle::MANIFEST_FILE_NAME;
+use crate::bundle::RAW_EVENT_LOG_FILE_NAME;
+use crate::bundle::REDUCED_TRACE_SCHEMA_VERSION;
+use crate::bundle::TraceBundleManifest;
+use crate::model::ExecutionStatus;
+use crate::model::RolloutTrace;
+use crate::payload::RawPayloadRef;
+use crate::raw_event::RawTraceEvent;
+use crate::raw_event::RawTraceEventPayload;
+
+mod code_cell;
+mod compaction;
+mod conversation;
+mod inference;
+#[cfg(test)]
+pub(crate) mod test_support;
+mod thread;
+mod tool;
+
+use self::code_cell::PendingCodeCellLifecycleEvent;
+use self::code_cell::PendingCodeCellStart;
+use self::code_cell::StartedCodeCell;
+use self::compaction::StartedCompactionRequest;
+use self::inference::StartedInferenceCall;
+use self::tool::ObservedAgentResultEdge;
+use self::tool::PendingAgentInteractionEdge;
+use self::tool::ToolCallStarted;
+
+/// Replays a local trace bundle into a reduced rollout graph.
+pub fn replay_bundle(bundle_dir: impl AsRef<Path>) -> Result<RolloutTrace> {
+    let bundle_dir = bundle_dir.as_ref();
+    let manifest: TraceBundleManifest =
+        serde_json::from_reader(File::open(bundle_dir.join(MANIFEST_FILE_NAME))?)
+ .with_context(|| format!("read {}", bundle_dir.join(MANIFEST_FILE_NAME).display()))?; + let mut reducer = TraceReducer { + rollout: RolloutTrace::new( + REDUCED_TRACE_SCHEMA_VERSION, + manifest.trace_id, + manifest.rollout_id, + manifest.root_thread_id, + manifest.started_at_unix_ms, + ), + bundle_dir: bundle_dir.to_path_buf(), + next_conversation_item_ordinal: 1, + next_terminal_operation_ordinal: 1, + thread_conversation_snapshots: BTreeMap::new(), + pending_compaction_replacement_item_ids: BTreeMap::new(), + code_cell_ids_by_runtime: BTreeMap::new(), + pending_code_cell_starts: BTreeMap::new(), + pending_code_cell_lifecycle_events: BTreeMap::new(), + pending_agent_interaction_edges: Vec::new(), + }; + + let event_log_path = bundle_dir.join(RAW_EVENT_LOG_FILE_NAME); + let event_log = File::open(&event_log_path) + .with_context(|| format!("open trace event log {}", event_log_path.display()))?; + for (line_index, line) in BufReader::new(event_log).lines().enumerate() { + let line = line.with_context(|| format!("read trace event line {}", line_index + 1))?; + if line.trim().is_empty() { + continue; + } + let event: RawTraceEvent = serde_json::from_str(&line) + .with_context(|| format!("parse trace event line {}", line_index + 1))?; + reducer.apply_event(event)?; + } + // Spawn edges prefer the child task message as their target, but a child can + // fail before that message is ever reduced. Only after replaying the whole + // bundle do we know which spawn deliveries need the child-thread fallback. + reducer.resolve_pending_spawn_edge_fallbacks()?; + + Ok(reducer.rollout) +} + +struct TraceReducer { + rollout: RolloutTrace, + bundle_dir: PathBuf, + next_conversation_item_ordinal: u64, + next_terminal_operation_ordinal: u64, + /// Last model-visible conversation snapshot per thread. + /// + /// Requests and responses both advance this sequence because both are + /// model-facing payloads. 
Repeated request snapshots reuse item IDs only + /// when the same normalized item appears at the same position; identical + /// content at a new position must remain a distinct conversation item. + thread_conversation_snapshots: BTreeMap>, + /// Replacement snapshot installed by compaction but not yet seen in a sampling request. + /// + /// The first full request after compaction should compare against the installed replacement + /// history, not against the pre-compaction request. That keeps repeated prefix/context messages + /// as fresh post-compaction conversation items while still reusing the summary/replacement + /// items that actually became live history. + pending_compaction_replacement_item_ids: BTreeMap>, + /// Runtime cell ids indexed by thread-local code-mode handle. + /// + /// Reduced `CodeCellId`s are based on the model-visible `exec` call id + /// because that is the durable source identity. Runtime lifecycle, nested + /// tools, and `wait` calls arrive with the runtime-local `cell_id`, so this + /// index is the one intentional bridge between those namespaces. + code_cell_ids_by_runtime: BTreeMap<(String, String), String>, + /// Code-cell starts whose model-visible `custom_tool_call` item has not + /// been reduced yet. + /// + /// Core begins executing tools before the stream-completion hook records + /// the response payload that requested them. Queueing keeps replay strict + /// about eventual source-item ownership without requiring trace producers + /// to reorder runtime events behind inference completion. + pending_code_cell_starts: BTreeMap, + /// Initial/end events that arrived while the matching start was queued. + /// + /// Fast cells can return before the inference response payload that proves + /// the model-visible `exec` source item has been reduced. The start remains + /// queued for ownership validation; these lifecycle events wait with it and + /// are replayed in raw sequence order once the cell materializes. 
+ pending_code_cell_lifecycle_events: BTreeMap>, + /// Multi-agent deliveries whose recipient-side transcript item has not been observed yet. + /// + /// V2 agent tools enqueue mailbox messages in the target thread. The trace event for the + /// sending tool arrives before the recipient inference request materializes that mailbox item + /// as a `ConversationItem`, so the reducer keeps the delivery edge pending until it can point + /// at the exact model-visible item instead of a coarse thread. + pending_agent_interaction_edges: Vec, +} + +impl TraceReducer { + fn read_payload_json(&self, payload: &RawPayloadRef) -> Result { + // Reducers keep raw bodies out of the graph, but typed replay sometimes + // needs a small subset of fields to build semantic objects. + let payload_path = self.bundle_dir.join(&payload.path); + let file = File::open(&payload_path) + .with_context(|| format!("open payload {}", payload.raw_payload_id))?; + serde_json::from_reader(file) + .with_context(|| format!("parse payload {}", payload.raw_payload_id)) + } + + fn apply_event(&mut self, event: RawTraceEvent) -> Result<()> { + // Raw payload refs are reducer-wide evidence, not owned by a single + // semantic arm. Keep this bookkeeping separate so typed reduction can + // stay strict without duplicating payload insertion in every case. 
+ for payload in event.payload.raw_payload_refs() { + self.insert_raw_payload(payload); + } + + match event.payload { + RawTraceEventPayload::RolloutStarted { + trace_id, + root_thread_id, + } => { + self.rollout.trace_id = trace_id; + self.rollout.root_thread_id = root_thread_id; + } + RawTraceEventPayload::RolloutEnded { status } => { + self.rollout.status = status; + self.rollout.ended_at_unix_ms = Some(event.wall_time_unix_ms); + } + RawTraceEventPayload::ThreadStarted { + thread_id, + agent_path, + metadata_payload, + } => { + self.start_thread( + event.seq, + event.wall_time_unix_ms, + thread_id, + agent_path, + metadata_payload, + )?; + } + RawTraceEventPayload::ThreadEnded { thread_id, status } => { + self.end_thread(event.seq, event.wall_time_unix_ms, thread_id, status)?; + } + RawTraceEventPayload::CodexTurnStarted { + codex_turn_id, + thread_id, + } => { + self.start_codex_turn( + event.seq, + event.wall_time_unix_ms, + codex_turn_id, + thread_id, + )?; + } + RawTraceEventPayload::CodexTurnEnded { + codex_turn_id, + status, + } => { + self.end_codex_turn( + event.seq, + event.wall_time_unix_ms, + event.thread_id, + codex_turn_id, + status, + )?; + } + RawTraceEventPayload::InferenceStarted { + inference_call_id, + thread_id, + codex_turn_id, + model, + provider_name, + request_payload, + } => { + self.start_inference_call( + event.seq, + event.wall_time_unix_ms, + StartedInferenceCall { + inference_call_id, + thread_id, + codex_turn_id, + model, + provider_name, + request_payload, + }, + )?; + } + RawTraceEventPayload::InferenceCompleted { + inference_call_id, + response_id, + response_payload, + } => { + self.complete_inference_call( + event.seq, + event.wall_time_unix_ms, + inference_call_id, + ExecutionStatus::Completed, + response_id, + Some(response_payload), + )?; + } + RawTraceEventPayload::InferenceFailed { + inference_call_id, + partial_response_payload, + .. 
+ } => { + self.complete_inference_call( + event.seq, + event.wall_time_unix_ms, + inference_call_id, + ExecutionStatus::Failed, + /*response_id*/ None, + partial_response_payload, + )?; + } + RawTraceEventPayload::ProtocolEventObserved { .. } => { + // Protocol wrappers are raw debug breadcrumbs. Typed hooks own + // the reduced graph, so these payload refs are retained without + // creating semantic objects. + } + RawTraceEventPayload::ToolCallStarted { + tool_call_id, + model_visible_call_id, + code_mode_runtime_tool_id, + requester, + kind, + summary, + invocation_payload, + } => { + self.start_tool_call( + event.seq, + event.wall_time_unix_ms, + event.thread_id, + event.codex_turn_id, + ToolCallStarted { + tool_call_id, + model_visible_call_id, + code_mode_runtime_tool_id, + requester, + kind, + summary, + invocation_payload, + }, + )?; + } + RawTraceEventPayload::ToolCallRuntimeStarted { + tool_call_id, + runtime_payload, + } => { + self.start_tool_runtime_observation( + event.seq, + event.wall_time_unix_ms, + tool_call_id, + runtime_payload, + )?; + } + RawTraceEventPayload::ToolCallRuntimeEnded { + tool_call_id, + status, + runtime_payload, + } => { + self.end_tool_runtime_observation( + event.seq, + event.wall_time_unix_ms, + tool_call_id, + status, + runtime_payload, + )?; + } + RawTraceEventPayload::ToolCallEnded { + tool_call_id, + status, + result_payload, + } => { + self.end_tool_call( + event.seq, + event.wall_time_unix_ms, + tool_call_id, + status, + result_payload, + )?; + } + RawTraceEventPayload::CodeCellStarted { + runtime_cell_id, + model_visible_call_id, + source_js, + } => { + let thread_id = self.code_cell_event_thread_id( + event.thread_id, + event.codex_turn_id.as_deref(), + &runtime_cell_id, + "code cell start", + )?; + let reduced_code_cell_id = + self.reduced_code_cell_id_for_model_visible_call(&model_visible_call_id); + self.record_runtime_code_cell_id( + &thread_id, + &runtime_cell_id, + &reduced_code_cell_id, + )?; + 
self.start_or_queue_code_cell(PendingCodeCellStart { + seq: event.seq, + wall_time_unix_ms: event.wall_time_unix_ms, + thread_id, + codex_turn_id: event.codex_turn_id, + started: StartedCodeCell { + code_cell_id: reduced_code_cell_id, + runtime_cell_id, + model_visible_call_id, + source_js, + }, + })?; + } + RawTraceEventPayload::CodeCellInitialResponse { + runtime_cell_id, + status, + .. + } => { + let thread_id = self.code_cell_event_thread_id( + event.thread_id, + event.codex_turn_id.as_deref(), + &runtime_cell_id, + "code cell initial response", + )?; + let code_cell_id = self.code_cell_id_for_runtime_cell_id( + &thread_id, + &runtime_cell_id, + "code cell initial response", + )?; + self.record_or_queue_code_cell_initial_response( + event.seq, + event.wall_time_unix_ms, + code_cell_id, + runtime_cell_id, + status, + )?; + } + RawTraceEventPayload::CodeCellEnded { + runtime_cell_id, + status, + .. + } => { + let thread_id = self.code_cell_event_thread_id( + event.thread_id, + event.codex_turn_id.as_deref(), + &runtime_cell_id, + "code cell end", + )?; + let code_cell_id = self.code_cell_id_for_runtime_cell_id( + &thread_id, + &runtime_cell_id, + "code cell end", + )?; + self.end_or_queue_code_cell( + event.seq, + event.wall_time_unix_ms, + code_cell_id, + status, + )?; + } + RawTraceEventPayload::CompactionRequestStarted { + compaction_id, + compaction_request_id, + thread_id, + codex_turn_id, + model, + provider_name, + request_payload, + } => { + self.start_compaction_request( + event.seq, + event.wall_time_unix_ms, + StartedCompactionRequest { + compaction_id, + compaction_request_id, + thread_id, + codex_turn_id, + model, + provider_name, + request_payload, + }, + )?; + } + RawTraceEventPayload::CompactionRequestCompleted { + compaction_id, + compaction_request_id, + response_payload, + } => { + self.complete_compaction_request( + event.seq, + event.wall_time_unix_ms, + compaction_id, + compaction_request_id, + ExecutionStatus::Completed, + 
Some(response_payload), + )?; + } + RawTraceEventPayload::CompactionRequestFailed { + compaction_id, + compaction_request_id, + .. + } => { + self.complete_compaction_request( + event.seq, + event.wall_time_unix_ms, + compaction_id, + compaction_request_id, + ExecutionStatus::Failed, + /*response_payload*/ None, + )?; + } + RawTraceEventPayload::CompactionInstalled { + compaction_id, + checkpoint_payload, + } => { + let Some(thread_id) = event.thread_id else { + bail!("compaction installed event {compaction_id} did not include a thread id"); + }; + let Some(codex_turn_id) = event.codex_turn_id else { + bail!( + "compaction installed event {compaction_id} did not include a codex turn id" + ); + }; + self.reduce_compaction_installed_event( + event.wall_time_unix_ms, + thread_id, + codex_turn_id, + compaction_id, + checkpoint_payload, + )?; + } + RawTraceEventPayload::AgentResultObserved { + edge_id, + child_thread_id, + child_codex_turn_id, + parent_thread_id, + message, + carried_payload, + } => { + self.queue_agent_result_interaction_edge(ObservedAgentResultEdge { + wall_time_unix_ms: event.wall_time_unix_ms, + edge_id, + child_thread_id, + child_codex_turn_id, + parent_thread_id, + message, + carried_payload, + })?; + } + RawTraceEventPayload::Other { .. } => { + bail!("raw trace event has no reducer implementation"); + } + } + + Ok(()) + } + + fn insert_raw_payload(&mut self, payload: &RawPayloadRef) { + self.rollout + .raw_payloads + .insert(payload.raw_payload_id.clone(), payload.clone()); + } +} diff --git a/codex-rs/rollout-trace/src/reducer/test_support.rs b/codex-rs/rollout-trace/src/reducer/test_support.rs new file mode 100644 index 000000000000..eae93eff6449 --- /dev/null +++ b/codex-rs/rollout-trace/src/reducer/test_support.rs @@ -0,0 +1,200 @@ +//! Shared reducer test fixtures. +//! +//! These helpers only write common trace scaffolding. Scenario-specific event +//! sequences stay in each test so the behavior under test remains visible. 
+ +use serde_json::json; +use tempfile::TempDir; + +use crate::model::ToolCallSummary; +use crate::payload::RawPayloadKind; +use crate::payload::RawPayloadRef; +use crate::raw_event::RawTraceEventContext; +use crate::raw_event::RawTraceEventPayload; +use crate::replay_bundle; +use crate::writer::TraceWriter; + +pub(crate) const ROOT_THREAD_ID: &str = "thread-root"; +pub(crate) const AGENT_ROOT_THREAD_ID: &str = "019d0000-0000-7000-8000-000000000001"; + +pub(crate) fn message(role: &str, text: &str) -> serde_json::Value { + json!({ + "type": "message", + "role": role, + "content": [{"type": "input_text", "text": text}] + }) +} + +pub(crate) fn generic_summary(label: &str) -> ToolCallSummary { + ToolCallSummary::Generic { + label: label.to_string(), + input_preview: None, + output_preview: None, + } +} + +pub(crate) fn create_started_writer(temp: &TempDir) -> anyhow::Result { + create_started_writer_for_thread(temp, ROOT_THREAD_ID, "/root") +} + +pub(crate) fn create_started_agent_writer(temp: &TempDir) -> anyhow::Result { + create_started_writer_for_thread(temp, AGENT_ROOT_THREAD_ID, "/root") +} + +pub(crate) fn create_started_writer_for_thread( + temp: &TempDir, + thread_id: &str, + agent_path: &str, +) -> anyhow::Result { + let writer = TraceWriter::create( + temp.path(), + "trace-1".to_string(), + "rollout-1".to_string(), + thread_id.to_string(), + )?; + start_thread(&writer, thread_id, agent_path)?; + Ok(writer) +} + +pub(crate) fn start_thread( + writer: &TraceWriter, + thread_id: &str, + agent_path: &str, +) -> anyhow::Result<()> { + writer.append(RawTraceEventPayload::ThreadStarted { + thread_id: thread_id.to_string(), + agent_path: agent_path.to_string(), + metadata_payload: None, + })?; + Ok(()) +} + +pub(crate) fn start_turn(writer: &TraceWriter, turn_id: &str) -> anyhow::Result<()> { + start_turn_for_thread(writer, ROOT_THREAD_ID, turn_id) +} + +pub(crate) fn start_agent_turn(writer: &TraceWriter, turn_id: &str) -> anyhow::Result<()> { + 
start_turn_for_thread(writer, AGENT_ROOT_THREAD_ID, turn_id) +} + +pub(crate) fn start_turn_for_thread( + writer: &TraceWriter, + thread_id: &str, + turn_id: &str, +) -> anyhow::Result<()> { + writer.append(RawTraceEventPayload::CodexTurnStarted { + codex_turn_id: turn_id.to_string(), + thread_id: thread_id.to_string(), + })?; + Ok(()) +} + +pub(crate) fn trace_context(turn_id: &str) -> RawTraceEventContext { + trace_context_for_thread(ROOT_THREAD_ID, turn_id) +} + +pub(crate) fn trace_context_for_agent(turn_id: &str) -> RawTraceEventContext { + trace_context_for_thread(AGENT_ROOT_THREAD_ID, turn_id) +} + +pub(crate) fn trace_context_for_thread(thread_id: &str, turn_id: &str) -> RawTraceEventContext { + RawTraceEventContext { + thread_id: Some(thread_id.to_string()), + codex_turn_id: Some(turn_id.to_string()), + } +} + +pub(crate) fn append_inference_start( + writer: &TraceWriter, + inference_call_id: &str, + codex_turn_id: &str, + request_payload: RawPayloadRef, +) -> anyhow::Result<()> { + append_inference_start_for_thread( + writer, + ROOT_THREAD_ID, + codex_turn_id, + inference_call_id, + request_payload, + ) +} + +pub(crate) fn append_inference_start_for_thread( + writer: &TraceWriter, + thread_id: &str, + codex_turn_id: &str, + inference_call_id: &str, + request_payload: RawPayloadRef, +) -> anyhow::Result<()> { + writer.append(RawTraceEventPayload::InferenceStarted { + inference_call_id: inference_call_id.to_string(), + thread_id: thread_id.to_string(), + codex_turn_id: codex_turn_id.to_string(), + model: "gpt-test".to_string(), + provider_name: "test-provider".to_string(), + request_payload, + })?; + Ok(()) +} + +pub(crate) fn append_inference_completion( + writer: &TraceWriter, + inference_call_id: &str, + response_id: &str, + response_payload: RawPayloadRef, +) -> anyhow::Result<()> { + writer.append(RawTraceEventPayload::InferenceCompleted { + inference_call_id: inference_call_id.to_string(), + response_id: Some(response_id.to_string()), + 
response_payload, + })?; + Ok(()) +} + +pub(crate) fn append_inference_request( + writer: &TraceWriter, + thread_id: &str, + turn_id: &str, + inference_id: &str, + input: Vec, +) -> anyhow::Result<()> { + let request = + writer.write_json_payload(RawPayloadKind::InferenceRequest, &json!({ "input": input }))?; + append_inference_start_for_thread(writer, thread_id, turn_id, inference_id, request) +} + +pub(crate) fn append_completed_inference( + writer: &TraceWriter, + thread_id: &str, + turn_id: &str, + inference_id: &str, + input: Vec, + output_items: Vec, +) -> anyhow::Result<()> { + append_inference_request(writer, thread_id, turn_id, inference_id, input)?; + let response = writer.write_json_payload( + RawPayloadKind::InferenceResponse, + &json!({ + "response_id": format!("resp-{inference_id}"), + "output_items": output_items, + }), + )?; + writer.append_with_context( + trace_context_for_thread(thread_id, turn_id), + RawTraceEventPayload::InferenceCompleted { + inference_call_id: inference_id.to_string(), + response_id: Some(format!("resp-{inference_id}")), + response_payload: response, + }, + )?; + Ok(()) +} + +pub(crate) fn expect_replay_error(temp: &TempDir, expected: &str) -> anyhow::Result<()> { + let Err(err) = replay_bundle(temp.path()) else { + panic!("expected replay error containing {expected}"); + }; + let message = err.to_string(); + assert!(message.contains(expected), "unexpected error: {message}"); + Ok(()) +} diff --git a/codex-rs/rollout-trace/src/reducer/thread.rs b/codex-rs/rollout-trace/src/reducer/thread.rs new file mode 100644 index 000000000000..6f24f694701d --- /dev/null +++ b/codex-rs/rollout-trace/src/reducer/thread.rs @@ -0,0 +1,264 @@ +//! Thread and turn reduction. +//! +//! Threads are the container that every other reducer module links into. This +//! module owns the identity metadata parsing as well, so the central dispatcher +//! does not need to know the shape of multi-agent session-source payloads. 
+
+use anyhow::Context;
+use anyhow::Result;
+use anyhow::bail;
+use serde::Deserialize;
+use serde_json::Value;
+
+use super::TraceReducer;
+use super::tool::spawn_edge_id;
+use crate::model::AgentOrigin;
+use crate::model::AgentThread;
+use crate::model::CodexTurn;
+use crate::model::CodexTurnId;
+use crate::model::ExecutionStatus;
+use crate::model::ExecutionWindow;
+use crate::model::RolloutStatus;
+use crate::payload::RawPayloadRef;
+use crate::raw_event::RawEventSeq;
+
+impl TraceReducer {
+    /// Inserts a thread and derives its multi-agent identity from optional metadata.
+    ///
+    /// The raw event carries a denormalized agent path; when v2 subagent metadata is
+    /// present, that metadata is authoritative because it also drives spawn edges and task names.
+    pub(super) fn start_thread(
+        &mut self,
+        seq: RawEventSeq,
+        wall_time_unix_ms: i64,
+        thread_id: String,
+        agent_path: String,
+        metadata_payload: Option<RawPayloadRef>,
+    ) -> Result<()> {
+        if self.rollout.threads.contains_key(&thread_id) {
+            bail!("duplicate thread start for {thread_id}");
+        }
+
+        let metadata = metadata_payload
+            .as_ref()
+            .map(|payload| self.thread_started_metadata(payload))
+            .transpose()?;
+        let spawn = metadata
+            .as_ref()
+            .and_then(ThreadStartedMetadata::thread_spawn);
+        // The v2 SessionSource is the authoritative child identity record.
+        // Prefer its nested agent_path over the denormalized event field so
+        // task derivation and the spawn edge are based on the same metadata.
+ let agent_path = spawn + .as_ref() + .and_then(|spawn| spawn.agent_path.clone()) + .or_else(|| { + metadata + .as_ref() + .and_then(|metadata| metadata.agent_path.clone()) + }) + .unwrap_or(agent_path); + let nickname = metadata + .as_ref() + .and_then(|metadata| metadata.nickname.clone()); + let default_model = metadata + .as_ref() + .and_then(|metadata| metadata.model.clone()); + let origin = if let Some(spawn) = spawn { + let edge_id = spawn_edge_id(&spawn.parent_thread_id, &thread_id); + let task_name = spawn + .task_name + .clone() + .unwrap_or_else(|| task_name_from_agent_path(&agent_path)); + let agent_role = spawn.agent_role.clone().unwrap_or_default(); + + AgentOrigin::Spawned { + parent_thread_id: spawn.parent_thread_id, + spawn_edge_id: edge_id, + task_name, + agent_role, + } + } else { + AgentOrigin::Root + }; + + self.rollout.threads.insert( + thread_id.clone(), + AgentThread { + thread_id, + agent_path, + nickname, + origin, + execution: ExecutionWindow { + started_at_unix_ms: wall_time_unix_ms, + started_seq: seq, + ended_at_unix_ms: None, + ended_seq: None, + status: ExecutionStatus::Running, + }, + default_model, + conversation_item_ids: Vec::new(), + }, + ); + Ok(()) + } + + /// Marks a thread terminal without treating child shutdown as rollout completion. + pub(super) fn end_thread( + &mut self, + seq: RawEventSeq, + wall_time_unix_ms: i64, + thread_id: String, + status: RolloutStatus, + ) -> Result<()> { + let thread = self.thread_mut(&thread_id)?; + thread.execution.ended_at_unix_ms = Some(wall_time_unix_ms); + thread.execution.ended_seq = Some(seq); + thread.execution.status = match status { + RolloutStatus::Running => ExecutionStatus::Running, + RolloutStatus::Completed => ExecutionStatus::Completed, + RolloutStatus::Failed => ExecutionStatus::Failed, + RolloutStatus::Aborted => ExecutionStatus::Aborted, + }; + Ok(()) + } + + /// Starts a Codex turn inside an existing thread. 
+ pub(super) fn start_codex_turn( + &mut self, + seq: RawEventSeq, + wall_time_unix_ms: i64, + codex_turn_id: CodexTurnId, + thread_id: String, + ) -> Result<()> { + if self.rollout.codex_turns.contains_key(&codex_turn_id) { + bail!("duplicate codex turn start for {codex_turn_id}"); + } + + self.thread_mut(&thread_id)?; + + self.rollout.codex_turns.insert( + codex_turn_id.clone(), + CodexTurn { + codex_turn_id, + thread_id, + execution: ExecutionWindow { + started_at_unix_ms: wall_time_unix_ms, + started_seq: seq, + ended_at_unix_ms: None, + ended_seq: None, + status: ExecutionStatus::Running, + }, + input_item_ids: Vec::new(), + }, + ); + Ok(()) + } + + /// Marks a Codex turn terminal and validates any thread id carried by the raw event. + pub(super) fn end_codex_turn( + &mut self, + seq: RawEventSeq, + wall_time_unix_ms: i64, + thread_id: Option, + codex_turn_id: CodexTurnId, + status: ExecutionStatus, + ) -> Result<()> { + if let Some(event_thread_id) = thread_id.as_deref() + && let Some(turn) = self.rollout.codex_turns.get(&codex_turn_id) + && turn.thread_id != event_thread_id + { + bail!( + "codex turn end for {codex_turn_id} used thread {event_thread_id}, \ + but the turn belongs to {}", + turn.thread_id + ); + } + + let Some(turn) = self.rollout.codex_turns.get_mut(&codex_turn_id) else { + bail!("codex turn end referenced unknown turn {codex_turn_id}"); + }; + turn.execution.ended_at_unix_ms = Some(wall_time_unix_ms); + turn.execution.ended_seq = Some(seq); + turn.execution.status = status.clone(); + self.terminate_running_code_cells_for_turn_end( + seq, + wall_time_unix_ms, + &codex_turn_id, + &status, + )?; + Ok(()) + } + + /// Returns a mutable thread or reports a reducer error tied to the unknown id. 
+ pub(super) fn thread_mut(&mut self, thread_id: &str) -> Result<&mut AgentThread> { + self.rollout + .threads + .get_mut(thread_id) + .with_context(|| format!("trace event referenced unknown thread {thread_id}")) + } + + fn thread_started_metadata( + &self, + metadata_payload: &RawPayloadRef, + ) -> Result { + let value = self.read_payload_json(metadata_payload)?; + serde_json::from_value(value) + .with_context(|| format!("parse thread metadata {}", metadata_payload.raw_payload_id)) + } +} + +#[derive(Deserialize)] +struct ThreadStartedMetadata { + agent_path: Option, + task_name: Option, + nickname: Option, + agent_role: Option, + model: Option, + session_source: Option, +} + +impl ThreadStartedMetadata { + fn thread_spawn(&self) -> Option { + let spawn = self + .session_source + .as_ref()? + .get("subagent")? + .get("thread_spawn")?; + let agent_path = spawn + .get("agent_path") + .and_then(Value::as_str) + .map(str::to_string) + .or_else(|| self.agent_path.clone()); + Some(ThreadSpawnMetadata { + parent_thread_id: spawn.get("parent_thread_id")?.as_str()?.to_string(), + agent_path: agent_path.clone(), + task_name: spawn + .get("task_name") + .and_then(Value::as_str) + .map(str::to_string) + .or_else(|| self.task_name.clone()) + .or_else(|| agent_path.as_deref().map(task_name_from_agent_path)), + agent_role: spawn + .get("agent_role") + .and_then(Value::as_str) + .map(str::to_string) + .or_else(|| self.agent_role.clone()), + }) + } +} + +struct ThreadSpawnMetadata { + parent_thread_id: String, + agent_path: Option, + task_name: Option, + agent_role: Option, +} + +fn task_name_from_agent_path(agent_path: &str) -> String { + agent_path + .rsplit('/') + .find(|segment| !segment.is_empty()) + .unwrap_or(agent_path) + .to_string() +} diff --git a/codex-rs/rollout-trace/src/reducer/tool.rs b/codex-rs/rollout-trace/src/reducer/tool.rs new file mode 100644 index 000000000000..d481bb6acdfb --- /dev/null +++ b/codex-rs/rollout-trace/src/reducer/tool.rs @@ -0,0 +1,500 @@ 
+use anyhow::Context; +use anyhow::Result; +use anyhow::bail; + +use super::TraceReducer; +use crate::model::CodeModeRuntimeToolId; +use crate::model::ConversationItemKind; +use crate::model::ExecutionStatus; +use crate::model::ExecutionWindow; +use crate::model::ModelVisibleCallId; +use crate::model::ProducerRef; +use crate::model::ToolCall; +use crate::model::ToolCallId; +use crate::model::ToolCallKind; +use crate::model::ToolCallSummary; +use crate::payload::RawPayloadRef; +use crate::raw_event::RawEventSeq; +use crate::raw_event::RawToolCallRequester; + +mod agents; +mod terminal; + +pub(super) use agents::ObservedAgentResultEdge; +pub(super) use agents::PendingAgentInteractionEdge; +pub(super) use agents::spawn_edge_id; + +/// Raw tool-start fields after dispatch has stripped the common event envelope. +/// +/// Tool starts carry several optional identity namespaces: model-visible calls, +/// code-mode runtime tools, and canonical invocation payloads. Grouping them keeps +/// the reducer callsite readable and avoids positional argument mistakes. +pub(super) struct ToolCallStarted { + pub(super) tool_call_id: ToolCallId, + pub(super) model_visible_call_id: Option, + pub(super) code_mode_runtime_tool_id: Option, + pub(super) requester: RawToolCallRequester, + pub(super) kind: ToolCallKind, + pub(super) summary: ToolCallSummary, + pub(super) invocation_payload: Option, +} + +impl TraceReducer { + /// Starts a tool call and links it to model-visible items or runtime parents when available. + /// + /// Some tools also create richer domain objects, such as terminal operations, from + /// the same invocation payload. The generic ToolCall remains the common index. 
+ pub(super) fn start_tool_call( + &mut self, + seq: RawEventSeq, + wall_time_unix_ms: i64, + thread_id: Option, + codex_turn_id: Option, + started: ToolCallStarted, + ) -> Result<()> { + let tool_call_id = started.tool_call_id.clone(); + if self.rollout.tool_calls.contains_key(&tool_call_id) { + bail!("duplicate tool call start for {tool_call_id}"); + } + self.ensure_unique_model_visible_tool_call( + started.model_visible_call_id.as_deref(), + &tool_call_id, + )?; + + let thread_id = self.tool_thread_id(thread_id, codex_turn_id.as_deref())?; + self.validate_tool_turn(&thread_id, codex_turn_id.as_deref())?; + + let model_visible_call_id = started.model_visible_call_id.clone(); + let requester = self.reduce_tool_call_requester(&thread_id, started.requester.clone())?; + let model_visible_call_item_ids = model_visible_call_id + .as_deref() + .map(|call_id| { + self.model_visible_tool_item_ids( + &thread_id, + call_id, + &[ + ConversationItemKind::FunctionCall, + ConversationItemKind::CustomToolCall, + ], + ) + }) + .unwrap_or_default(); + let model_visible_output_item_ids = model_visible_call_id + .as_deref() + .map(|call_id| { + self.model_visible_tool_item_ids( + &thread_id, + call_id, + &[ + ConversationItemKind::FunctionCallOutput, + ConversationItemKind::CustomToolCallOutput, + ], + ) + }) + .unwrap_or_default(); + + self.thread_mut(&thread_id)?; + + // Some terminal-like tools, notably write_stdin, do not emit a richer + // runtime begin event. For those tools the canonical invocation is the + // only place to recover the terminal/session join key. + let terminal_operation_id = self.start_terminal_operation_from_invocation( + seq, + wall_time_unix_ms, + &thread_id, + &tool_call_id, + &started.kind, + started.invocation_payload.as_ref(), + )?; + // Terminal-backed tools should render through the richer terminal + // operation instead of the generic tool summary captured by producers. 
+ let summary = terminal_operation_id + .as_ref() + .map(|operation_id| ToolCallSummary::Terminal { + operation_id: operation_id.clone(), + }) + .unwrap_or(started.summary); + let raw_invocation_payload_id = started + .invocation_payload + .as_ref() + .map(|payload| payload.raw_payload_id.clone()); + self.link_wait_tool_call_from_request_payload( + &thread_id, + &tool_call_id, + started.invocation_payload.as_ref(), + )?; + + self.rollout.tool_calls.insert( + tool_call_id.clone(), + ToolCall { + tool_call_id: tool_call_id.clone(), + model_visible_call_id, + code_mode_runtime_tool_id: started.code_mode_runtime_tool_id, + thread_id, + started_by_codex_turn_id: codex_turn_id, + execution: ExecutionWindow { + started_at_unix_ms: wall_time_unix_ms, + started_seq: seq, + ended_at_unix_ms: None, + ended_seq: None, + status: ExecutionStatus::Running, + }, + requester: requester.clone(), + kind: started.kind, + model_visible_call_item_ids, + model_visible_output_item_ids: Vec::new(), + terminal_operation_id, + summary, + raw_invocation_payload_id, + raw_result_payload_id: None, + raw_runtime_payload_ids: Vec::new(), + }, + ); + + self.link_tool_call_to_code_cell(&tool_call_id, &requester)?; + self.link_tool_to_inference_response(&tool_call_id); + // Output items need the reverse ProducerRef edge as well, so attach + // them after insertion through the same helper used by the transcript + // reducer when the output is observed after the tool start. + for item_id in model_visible_output_item_ids { + self.add_tool_output_item(&tool_call_id, &item_id)?; + } + // The call/output items may have been observed before this tool start. + // Re-sync after insertion so terminal observations get both directions + // of the model-visible link. + self.sync_terminal_model_observation(&tool_call_id)?; + Ok(()) + } + + /// Completes the canonical tool call and any terminal operation driven by dispatch output. 
+ /// + /// Protocol-backed terminal tools end from runtime events; direct tools + /// may only have the canonical result payload, so this method handles both paths. + pub(super) fn end_tool_call( + &mut self, + seq: RawEventSeq, + wall_time_unix_ms: i64, + tool_call_id: ToolCallId, + status: ExecutionStatus, + result_payload: Option, + ) -> Result<()> { + let (terminal_operation_id, thread_id, end_terminal_from_result) = { + let Some(tool_call) = self.rollout.tool_calls.get_mut(&tool_call_id) else { + bail!("tool call end referenced unknown call {tool_call_id}"); + }; + tool_call.execution.ended_at_unix_ms = Some(wall_time_unix_ms); + tool_call.execution.ended_seq = Some(seq); + tool_call.execution.status = status.clone(); + tool_call.raw_result_payload_id = result_payload + .as_ref() + .map(|payload| payload.raw_payload_id.clone()); + ( + tool_call.terminal_operation_id.clone(), + tool_call.thread_id.clone(), + // Protocol-backed tools end terminal operations from + // runtime observations. Dispatch result payloads are still kept + // on ToolCall, but they are caller-facing and may be transformed + // relative to the raw terminal output. + tool_call.raw_runtime_payload_ids.is_empty(), + ) + }; + if end_terminal_from_result && let Some(operation_id) = terminal_operation_id { + self.end_terminal_operation( + seq, + wall_time_unix_ms, + &thread_id, + &operation_id, + status, + result_payload.as_ref(), + )?; + } + self.attach_agent_interaction_tool_result(&tool_call_id, result_payload.as_ref())?; + Ok(()) + } + + /// Records a runtime-begin observation for an already started tool call. + /// + /// Runtime observations enrich the generic tool with protocol facts and may + /// create domain-specific children such as terminal operations or agent edges. 
+ pub(super) fn start_tool_runtime_observation( + &mut self, + seq: RawEventSeq, + wall_time_unix_ms: i64, + tool_call_id: ToolCallId, + runtime_payload: RawPayloadRef, + ) -> Result<()> { + let (thread_id, _requester, kind, existing_terminal_operation_id) = { + let Some(tool_call) = self.rollout.tool_calls.get_mut(&tool_call_id) else { + bail!("tool runtime start referenced unknown call {tool_call_id}"); + }; + push_unique( + &mut tool_call.raw_runtime_payload_ids, + &runtime_payload.raw_payload_id, + ); + ( + tool_call.thread_id.clone(), + tool_call.requester.clone(), + tool_call.kind.clone(), + tool_call.terminal_operation_id.clone(), + ) + }; + if existing_terminal_operation_id.is_some() + && matches!(kind, ToolCallKind::ExecCommand | ToolCallKind::WriteStdin) + { + bail!("tool runtime start would create a second terminal operation for {tool_call_id}"); + } + + // Protocol begin events carry runtime facts such as process ids and + // cwd. These facts should create terminal rows, but they must not + // replace the canonical invocation payload captured at dispatch. + let terminal_operation_id = self.start_terminal_operation_from_runtime( + seq, + wall_time_unix_ms, + &thread_id, + &tool_call_id, + &kind, + &runtime_payload, + )?; + + if let Some(operation_id) = &terminal_operation_id { + let Some(tool_call) = self.rollout.tool_calls.get_mut(&tool_call_id) else { + bail!("tool call {tool_call_id} disappeared during runtime start reduction"); + }; + if tool_call.terminal_operation_id.is_none() { + tool_call.terminal_operation_id = Some(operation_id.clone()); + tool_call.summary = ToolCallSummary::Terminal { + operation_id: operation_id.clone(), + }; + } + } + + if terminal_operation_id.is_some() { + self.sync_terminal_model_observation(&tool_call_id)?; + } + self.start_agent_interaction_from_runtime(&tool_call_id, &runtime_payload)?; + Ok(()) + } + + /// Records a runtime-end observation for an already started tool call. 
+ pub(super) fn end_tool_runtime_observation( + &mut self, + seq: RawEventSeq, + wall_time_unix_ms: i64, + tool_call_id: ToolCallId, + status: ExecutionStatus, + runtime_payload: RawPayloadRef, + ) -> Result<()> { + let (thread_id, terminal_operation_id) = { + let Some(tool_call) = self.rollout.tool_calls.get_mut(&tool_call_id) else { + bail!("tool runtime end referenced unknown call {tool_call_id}"); + }; + push_unique( + &mut tool_call.raw_runtime_payload_ids, + &runtime_payload.raw_payload_id, + ); + ( + tool_call.thread_id.clone(), + tool_call.terminal_operation_id.clone(), + ) + }; + + if let Some(operation_id) = terminal_operation_id { + self.end_terminal_operation( + seq, + wall_time_unix_ms, + &thread_id, + &operation_id, + status, + Some(&runtime_payload), + )?; + } + self.end_agent_interaction_from_runtime( + wall_time_unix_ms, + &tool_call_id, + &runtime_payload, + )?; + Ok(()) + } + + /// Attaches a conversation item observed after the tool call was reduced. + /// + /// Inference request/response ordering can expose call/output items after the + /// runtime tool object exists, so transcript reduction calls back here to add + /// reverse links without duplicating matching logic. + pub(super) fn attach_model_visible_tool_item( + &mut self, + item_id: &str, + call_id: Option<&str>, + kind: &ConversationItemKind, + ) -> Result<()> { + let Some(call_id) = call_id else { + return Ok(()); + }; + match kind { + ConversationItemKind::FunctionCall | ConversationItemKind::CustomToolCall => { + if let Some(tool_call_id) = self.single_tool_for_model_visible_call(call_id)? { + self.add_tool_call_item(&tool_call_id, item_id)?; + self.link_tool_to_inference_response(&tool_call_id); + self.sync_terminal_model_observation(&tool_call_id)?; + } + } + ConversationItemKind::FunctionCallOutput + | ConversationItemKind::CustomToolCallOutput => { + if let Some(tool_call_id) = self.single_tool_for_model_visible_call(call_id)? 
{ + self.add_tool_output_item(&tool_call_id, item_id)?; + self.sync_terminal_model_observation(&tool_call_id)?; + } + } + ConversationItemKind::Message + | ConversationItemKind::Reasoning + | ConversationItemKind::CompactionMarker => {} + } + Ok(()) + } + + fn tool_thread_id( + &self, + thread_id: Option, + codex_turn_id: Option<&str>, + ) -> Result { + if let Some(thread_id) = thread_id { + return Ok(thread_id); + } + let Some(codex_turn_id) = codex_turn_id else { + bail!("tool call start did not include thread or Codex turn context"); + }; + self.rollout + .codex_turns + .get(codex_turn_id) + .map(|turn| turn.thread_id.clone()) + .with_context(|| { + format!("tool call start referenced unknown Codex turn {codex_turn_id}") + }) + } + + fn validate_tool_turn(&self, thread_id: &str, codex_turn_id: Option<&str>) -> Result<()> { + if !self.rollout.threads.contains_key(thread_id) { + bail!("tool call start referenced unknown thread {thread_id}"); + } + if let Some(codex_turn_id) = codex_turn_id { + let Some(turn) = self.rollout.codex_turns.get(codex_turn_id) else { + bail!("tool call start referenced unknown Codex turn {codex_turn_id}"); + }; + if turn.thread_id != thread_id { + bail!( + "tool call start used thread {thread_id}, but Codex turn {codex_turn_id} \ + belongs to {}", + turn.thread_id + ); + } + } + Ok(()) + } + + fn ensure_unique_model_visible_tool_call( + &self, + model_visible_call_id: Option<&str>, + tool_call_id: &str, + ) -> Result<()> { + let Some(model_visible_call_id) = model_visible_call_id else { + return Ok(()); + }; + if let Some(existing) = self.single_tool_for_model_visible_call(model_visible_call_id)? 
+ && existing != tool_call_id + { + bail!("duplicate tool call for model-visible call id {model_visible_call_id}"); + } + Ok(()) + } + + fn single_tool_for_model_visible_call( + &self, + model_visible_call_id: &str, + ) -> Result> { + let mut matching = self + .rollout + .tool_calls + .values() + .filter(|tool| tool.model_visible_call_id.as_deref() == Some(model_visible_call_id)) + .map(|tool| tool.tool_call_id.clone()); + let first = matching.next(); + if matching.next().is_some() { + bail!("multiple tool calls matched model-visible call id {model_visible_call_id}"); + } + Ok(first) + } + + fn model_visible_tool_item_ids( + &self, + thread_id: &str, + call_id: &str, + kinds: &[ConversationItemKind], + ) -> Vec { + self.rollout + .conversation_items + .values() + .filter(|item| { + item.thread_id == thread_id + && item.call_id.as_deref() == Some(call_id) + && kinds.contains(&item.kind) + }) + .map(|item| item.item_id.clone()) + .collect::>() + } + + fn add_tool_call_item(&mut self, tool_call_id: &str, item_id: &str) -> Result<()> { + let Some(tool_call) = self.rollout.tool_calls.get_mut(tool_call_id) else { + bail!("tool call {tool_call_id} disappeared during conversation linking"); + }; + push_unique(&mut tool_call.model_visible_call_item_ids, item_id); + Ok(()) + } + + fn add_tool_output_item(&mut self, tool_call_id: &str, item_id: &str) -> Result<()> { + let Some(tool_call) = self.rollout.tool_calls.get_mut(tool_call_id) else { + bail!("tool call {tool_call_id} disappeared during output linking"); + }; + push_unique(&mut tool_call.model_visible_output_item_ids, item_id); + + let Some(item) = self.rollout.conversation_items.get_mut(item_id) else { + bail!("conversation item {item_id} disappeared during output linking"); + }; + let producer = ProducerRef::Tool { + tool_call_id: tool_call_id.to_string(), + }; + if !item.produced_by.contains(&producer) { + item.produced_by.push(producer); + } + Ok(()) + } + + fn link_tool_to_inference_response(&mut self, 
tool_call_id: &str) { + let Some(tool_call) = self.rollout.tool_calls.get(tool_call_id) else { + return; + }; + let call_item_ids = tool_call.model_visible_call_item_ids.clone(); + if call_item_ids.is_empty() { + return; + } + for inference in self.rollout.inference_calls.values_mut() { + if inference + .response_item_ids + .iter() + .any(|item_id| call_item_ids.contains(item_id)) + && !inference + .tool_call_ids_started_by_response + .contains(&tool_call_id.to_string()) + { + inference + .tool_call_ids_started_by_response + .push(tool_call_id.to_string()); + } + } + } +} + +fn push_unique(items: &mut Vec, item_id: &str) { + if !items.iter().any(|existing| existing == item_id) { + items.push(item_id.to_string()); + } +} diff --git a/codex-rs/rollout-trace/src/reducer/tool/agents.rs b/codex-rs/rollout-trace/src/reducer/tool/agents.rs new file mode 100644 index 000000000000..a49b794d1470 --- /dev/null +++ b/codex-rs/rollout-trace/src/reducer/tool/agents.rs @@ -0,0 +1,621 @@ +use anyhow::Context; +use anyhow::Result; +use anyhow::bail; +use codex_protocol::protocol::CollabAgentInteractionBeginEvent; +use codex_protocol::protocol::CollabAgentInteractionEndEvent; +use codex_protocol::protocol::CollabAgentSpawnEndEvent; +use codex_protocol::protocol::CollabCloseBeginEvent; +use codex_protocol::protocol::CollabCloseEndEvent; +use codex_protocol::protocol::InterAgentCommunication; + +use super::super::TraceReducer; +use crate::model::ConversationItem; +use crate::model::ConversationItemKind; +use crate::model::ConversationPart; +use crate::model::ConversationRole; +use crate::model::InteractionEdge; +use crate::model::InteractionEdgeKind; +use crate::model::ToolCallKind; +use crate::model::TraceAnchor; +use crate::payload::RawPayloadRef; + +/// Agent delivery edge waiting for the recipient-side conversation item. +/// +/// Multi-agent v2 records the sender tool before the target thread necessarily +/// includes the delivered mailbox message in a model-visible request. 
The edge +/// stays pending so it can target that exact conversation item when possible. +pub(in crate::reducer) struct PendingAgentInteractionEdge { + pub(in crate::reducer) edge_id: String, + pub(in crate::reducer) kind: InteractionEdgeKind, + pub(in crate::reducer) source: TraceAnchor, + pub(in crate::reducer) target_thread_id: String, + pub(in crate::reducer) message_content: String, + /// Spawn-only fallback for children that fail before their task message is model-visible. + pub(in crate::reducer) unresolved_spawn_thread_id: Option, + pub(in crate::reducer) started_at_unix_ms: i64, + pub(in crate::reducer) ended_at_unix_ms: Option, + pub(in crate::reducer) carried_raw_payload_ids: Vec, +} + +/// Typed reducer input for a multi-agent v2 child completion notification. +/// +/// Child results are observed outside the normal tool lifecycle, but they still +/// carry a parent-thread notification. This wrapper keeps the dispatcher from +/// passing a positional bundle of thread and turn ids. +pub(in crate::reducer) struct ObservedAgentResultEdge { + pub(in crate::reducer) wall_time_unix_ms: i64, + pub(in crate::reducer) edge_id: String, + pub(in crate::reducer) child_thread_id: String, + pub(in crate::reducer) child_codex_turn_id: String, + pub(in crate::reducer) parent_thread_id: String, + pub(in crate::reducer) message: String, + pub(in crate::reducer) carried_payload: Option, +} + +/// Builds the stable edge id for the spawn relationship between two threads. +pub(in crate::reducer) fn spawn_edge_id(parent_thread_id: &str, child_thread_id: &str) -> String { + format!("edge:spawn:{parent_thread_id}:{child_thread_id}") +} + +impl TraceReducer { + /// Starts a multi-agent edge from a runtime begin payload, when the tool kind supports one. 
+ pub(super) fn start_agent_interaction_from_runtime( + &mut self, + tool_call_id: &str, + runtime_payload: &RawPayloadRef, + ) -> Result<()> { + let kind = self + .rollout + .tool_calls + .get(tool_call_id) + .with_context(|| format!("agent edge referenced unknown tool call {tool_call_id}"))? + .kind + .clone(); + match kind { + ToolCallKind::AssignAgentTask => { + let payload: CollabAgentInteractionBeginEvent = + serde_json::from_value(self.read_payload_json(runtime_payload)?)?; + self.queue_message_agent_interaction( + tool_call_id, + InteractionEdgeKind::AssignAgentTask, + payload.receiver_thread_id.to_string(), + payload.prompt, + /*ended_at_unix_ms*/ None, + ) + } + ToolCallKind::SendMessage => { + let payload: CollabAgentInteractionBeginEvent = + serde_json::from_value(self.read_payload_json(runtime_payload)?)?; + self.queue_message_agent_interaction( + tool_call_id, + InteractionEdgeKind::SendMessage, + payload.receiver_thread_id.to_string(), + payload.prompt, + /*ended_at_unix_ms*/ None, + ) + } + ToolCallKind::CloseAgent => { + let payload: CollabCloseBeginEvent = + serde_json::from_value(self.read_payload_json(runtime_payload)?)?; + self.upsert_close_agent_interaction( + tool_call_id, + payload.receiver_thread_id.to_string(), + /*ended_at_unix_ms*/ None, + ) + } + ToolCallKind::ExecCommand + | ToolCallKind::WriteStdin + | ToolCallKind::ApplyPatch + | ToolCallKind::Mcp { .. } + | ToolCallKind::Web + | ToolCallKind::ImageGeneration + | ToolCallKind::SpawnAgent + | ToolCallKind::WaitAgent + | ToolCallKind::Other { .. } => Ok(()), + } + } + + /// Ends or enriches a multi-agent edge from a runtime end payload. 
+ pub(super) fn end_agent_interaction_from_runtime( + &mut self, + wall_time_unix_ms: i64, + tool_call_id: &str, + runtime_payload: &RawPayloadRef, + ) -> Result<()> { + let kind = self.rollout.tool_calls[tool_call_id].kind.clone(); + match kind { + ToolCallKind::SpawnAgent => { + let payload: CollabAgentSpawnEndEvent = + serde_json::from_value(self.read_payload_json(runtime_payload)?)?; + self.end_spawn_agent_interaction(wall_time_unix_ms, tool_call_id, &payload) + } + ToolCallKind::AssignAgentTask => { + let payload: CollabAgentInteractionEndEvent = + serde_json::from_value(self.read_payload_json(runtime_payload)?)?; + self.end_message_agent_interaction( + wall_time_unix_ms, + tool_call_id, + InteractionEdgeKind::AssignAgentTask, + &payload, + ) + } + ToolCallKind::SendMessage => { + let payload: CollabAgentInteractionEndEvent = + serde_json::from_value(self.read_payload_json(runtime_payload)?)?; + self.end_message_agent_interaction( + wall_time_unix_ms, + tool_call_id, + InteractionEdgeKind::SendMessage, + &payload, + ) + } + ToolCallKind::CloseAgent => { + let payload: CollabCloseEndEvent = + serde_json::from_value(self.read_payload_json(runtime_payload)?)?; + self.upsert_close_agent_interaction( + tool_call_id, + payload.receiver_thread_id.to_string(), + Some(wall_time_unix_ms), + ) + } + ToolCallKind::ExecCommand + | ToolCallKind::WriteStdin + | ToolCallKind::ApplyPatch + | ToolCallKind::Mcp { .. } + | ToolCallKind::Web + | ToolCallKind::ImageGeneration + | ToolCallKind::WaitAgent + | ToolCallKind::Other { .. } => Ok(()), + } + } + + /// Adds the canonical tool result payload to an already reduced multi-agent edge. 
+ pub(super) fn attach_agent_interaction_tool_result( + &mut self, + tool_call_id: &str, + result_payload: Option<&RawPayloadRef>, + ) -> Result<()> { + let Some(result_payload) = result_payload else { + return Ok(()); + }; + if let Some(edge) = self + .rollout + .interaction_edges + .values_mut() + .find(|edge| tool_call_source_matches(&edge.source, tool_call_id)) + { + push_unique( + &mut edge.carried_raw_payload_ids, + &result_payload.raw_payload_id, + ); + return Ok(()); + } + + // Agent delivery edges intentionally wait for the recipient-side + // conversation item. Tool end can arrive before that item is + // reduced, so preserve the response payload on the pending edge rather + // than dropping evidence until the delivery materializes. + if let Some(pending) = self + .pending_agent_interaction_edges + .iter_mut() + .find(|pending| tool_call_source_matches(&pending.source, tool_call_id)) + { + push_unique( + &mut pending.carried_raw_payload_ids, + &result_payload.raw_payload_id, + ); + } + Ok(()) + } + + fn end_spawn_agent_interaction( + &mut self, + wall_time_unix_ms: i64, + tool_call_id: &str, + payload: &CollabAgentSpawnEndEvent, + ) -> Result<()> { + let Some(child_thread_id) = payload.new_thread_id else { + return Ok(()); + }; + let tool_call = &self.rollout.tool_calls[tool_call_id]; + let child_thread_id = child_thread_id.to_string(); + let edge_id = spawn_edge_id(&payload.sender_thread_id.to_string(), &child_thread_id); + + self.queue_or_resolve_agent_interaction_edge(PendingAgentInteractionEdge { + edge_id, + kind: InteractionEdgeKind::SpawnAgent, + source: TraceAnchor::ToolCall { + tool_call_id: tool_call_id.to_string(), + }, + target_thread_id: child_thread_id.clone(), + message_content: payload.prompt.clone(), + unresolved_spawn_thread_id: Some(child_thread_id), + started_at_unix_ms: tool_call.execution.started_at_unix_ms, + ended_at_unix_ms: Some(wall_time_unix_ms), + carried_raw_payload_ids: self.agent_tool_payload_ids(tool_call_id)?, + }) + } + 
+ fn end_message_agent_interaction( + &mut self, + wall_time_unix_ms: i64, + tool_call_id: &str, + edge_kind: InteractionEdgeKind, + payload: &CollabAgentInteractionEndEvent, + ) -> Result<()> { + self.queue_message_agent_interaction( + tool_call_id, + edge_kind, + payload.receiver_thread_id.to_string(), + payload.prompt.clone(), + Some(wall_time_unix_ms), + ) + } + + fn queue_message_agent_interaction( + &mut self, + tool_call_id: &str, + kind: InteractionEdgeKind, + target_thread_id: String, + message_content: String, + ended_at_unix_ms: Option, + ) -> Result<()> { + let tool_call = &self.rollout.tool_calls[tool_call_id]; + self.queue_or_resolve_agent_interaction_edge(PendingAgentInteractionEdge { + edge_id: tool_edge_id(tool_call_id), + kind, + source: TraceAnchor::ToolCall { + tool_call_id: tool_call_id.to_string(), + }, + target_thread_id, + message_content, + unresolved_spawn_thread_id: None, + started_at_unix_ms: tool_call.execution.started_at_unix_ms, + ended_at_unix_ms, + carried_raw_payload_ids: self.agent_tool_payload_ids(tool_call_id)?, + }) + } + + fn agent_tool_payload_ids(&self, tool_call_id: &str) -> Result> { + let tool_call = + self.rollout.tool_calls.get(tool_call_id).with_context(|| { + format!("agent edge referenced unknown tool call {tool_call_id}") + })?; + let mut payload_ids = Vec::new(); + if let Some(payload_id) = &tool_call.raw_invocation_payload_id { + push_unique(&mut payload_ids, payload_id); + } + for payload_id in &tool_call.raw_runtime_payload_ids { + push_unique(&mut payload_ids, payload_id); + } + if let Some(payload_id) = &tool_call.raw_result_payload_id { + push_unique(&mut payload_ids, payload_id); + } + Ok(payload_ids) + } + + fn upsert_close_agent_interaction( + &mut self, + tool_call_id: &str, + target_thread_id: String, + ended_at_unix_ms: Option, + ) -> Result<()> { + if !self.rollout.threads.contains_key(&target_thread_id) { + // A failed close can name a thread that never participated in this + // trace. 
Keep that evidence on the ToolCall raw payloads rather + // than creating an anchor to a non-existent reduced object. + return Ok(()); + } + let started_at_unix_ms = self + .rollout + .tool_calls + .get(tool_call_id) + .with_context(|| format!("close edge referenced unknown tool call {tool_call_id}"))? + .execution + .started_at_unix_ms; + let carried_raw_payload_ids = self.agent_tool_payload_ids(tool_call_id)?; + self.upsert_interaction_edge(InteractionEdge { + edge_id: tool_edge_id(tool_call_id), + kind: InteractionEdgeKind::CloseAgent, + source: TraceAnchor::ToolCall { + tool_call_id: tool_call_id.to_string(), + }, + target: TraceAnchor::Thread { + thread_id: target_thread_id, + }, + started_at_unix_ms, + ended_at_unix_ms, + carried_item_ids: Vec::new(), + carried_raw_payload_ids, + }) + } + + /// Queues or resolves the edge from a child completion to its parent notification. + pub(in crate::reducer) fn queue_agent_result_interaction_edge( + &mut self, + observed: ObservedAgentResultEdge, + ) -> Result<()> { + let source = if let Some(source_item_id) = self.latest_assistant_message_item_for_turn( + &observed.child_thread_id, + &observed.child_codex_turn_id, + ) { + TraceAnchor::ConversationItem { + item_id: source_item_id, + } + } else { + // Child completion is delivered from AgentStatus, not from transcript + // content. Failed or cancelled children can therefore notify the parent + // without producing a final assistant message. Anchor those edges to + // the child thread so the trace keeps the valid delivery instead of + // inventing a missing conversation item. 
            TraceAnchor::Thread {
                thread_id: observed.child_thread_id,
            }
        };

        self.queue_or_resolve_agent_interaction_edge(PendingAgentInteractionEdge {
            edge_id: observed.edge_id,
            kind: InteractionEdgeKind::AgentResult,
            source,
            target_thread_id: observed.parent_thread_id,
            message_content: observed.message,
            unresolved_spawn_thread_id: None,
            started_at_unix_ms: observed.wall_time_unix_ms,
            // Result delivery is observed as a single instant, so the window
            // opens and closes on the same wall-clock timestamp.
            ended_at_unix_ms: Some(observed.wall_time_unix_ms),
            carried_raw_payload_ids: observed
                .carried_payload
                .map(|payload| vec![payload.raw_payload_id])
                .unwrap_or_default(),
        })
    }

    /// Resolves pending agent edges whose target is the newly reduced conversation item.
    ///
    /// Only the first pending edge whose target thread and message content match
    /// the item is resolved; any later duplicates stay queued for later items.
    pub(in crate::reducer) fn resolve_pending_agent_edges_for_item(
        &mut self,
        item_id: &str,
    ) -> Result<()> {
        // Non-mailbox items can never be delivery targets, so bail out early.
        let Some((thread_id, message_content)) = self.inter_agent_message_item(item_id) else {
            return Ok(());
        };
        let Some(pending_index) = self
            .pending_agent_interaction_edges
            .iter()
            .position(|pending| {
                pending.target_thread_id == thread_id && pending.message_content == message_content
            })
        else {
            return Ok(());
        };
        let pending = self.pending_agent_interaction_edges.remove(pending_index);
        self.upsert_agent_interaction_edge_for_item(pending, item_id.to_string())
    }

    /// Links a pending agent edge to a delivered mailbox item now if one exists,
    /// otherwise queues it, merging with any earlier observation of the same edge.
    ///
    /// Re-observations of a queued edge must agree on kind, endpoints, message
    /// content, and spawn fallback; they may only widen the execution window and
    /// add carried raw payloads.
    fn queue_or_resolve_agent_interaction_edge(
        &mut self,
        pending: PendingAgentInteractionEdge,
    ) -> Result<()> {
        // Fast path: the target conversation item was already reduced.
        if let Some(item_id) = self.find_unlinked_inter_agent_message_item(
            &pending.target_thread_id,
            &pending.message_content,
        ) {
            return self.upsert_agent_interaction_edge_for_item(pending, item_id);
        }

        if let Some(existing) = self
            .pending_agent_interaction_edges
            .iter_mut()
            .find(|existing| existing.edge_id == pending.edge_id)
        {
            if existing.kind != pending.kind
                || existing.source != pending.source
                || existing.target_thread_id != pending.target_thread_id
                || existing.message_content != pending.message_content
                || existing.unresolved_spawn_thread_id != pending.unresolved_spawn_thread_id
            {
                bail!(
                    "pending interaction edge {} was observed with conflicting delivery data",
                    pending.edge_id
                );
            }
            // Merge: keep the earliest start and the latest known end.
            existing.started_at_unix_ms =
                existing.started_at_unix_ms.min(pending.started_at_unix_ms);
            existing.ended_at_unix_ms = match (existing.ended_at_unix_ms, pending.ended_at_unix_ms)
            {
                (Some(existing_ended), Some(pending_ended)) => {
                    Some(existing_ended.max(pending_ended))
                }
                (None, ended) | (ended, None) => ended,
            };
            extend_unique(
                &mut existing.carried_raw_payload_ids,
                pending.carried_raw_payload_ids,
            );
            return Ok(());
        }

        self.pending_agent_interaction_edges.push(pending);
        Ok(())
    }

    /// Materializes unresolved spawn edges that have a valid child-thread fallback target.
    ///
    /// NOTE(review): `mem::take` drains the entire pending list; entries without a
    /// spawn fallback, or whose child thread never appeared, are discarded here
    /// rather than re-queued — presumably this runs once at end of replay. Confirm
    /// against the caller before invoking it mid-stream.
    pub(in crate::reducer) fn resolve_pending_spawn_edge_fallbacks(&mut self) -> Result<()> {
        let pending_edges = std::mem::take(&mut self.pending_agent_interaction_edges);
        for pending in pending_edges {
            let Some(child_thread_id) = pending.unresolved_spawn_thread_id else {
                continue;
            };
            // Only spawn edges are allowed to carry a thread fallback target.
            if pending.kind != InteractionEdgeKind::SpawnAgent {
                bail!(
                    "non-spawn interaction edge {} carried a spawn fallback target",
                    pending.edge_id
                );
            }
            if !self.rollout.threads.contains_key(&child_thread_id) {
                continue;
            }

            // Spawn normally resolves to the child task message because that is
            // where the delegated work first becomes model-visible. A child can
            // fail before that transcript item exists, but the spawned thread is
            // still real and the spawning tool still created it. Preserve that
            // relationship with the thread fallback instead of dropping the edge.
            self.upsert_interaction_edge(InteractionEdge {
                edge_id: pending.edge_id,
                kind: pending.kind,
                source: pending.source,
                target: TraceAnchor::Thread {
                    thread_id: child_thread_id,
                },
                started_at_unix_ms: pending.started_at_unix_ms,
                ended_at_unix_ms: pending.ended_at_unix_ms,
                carried_item_ids: Vec::new(),
                carried_raw_payload_ids: pending.carried_raw_payload_ids,
            })?;
        }
        Ok(())
    }

    /// Finalizes a pending edge against its delivered conversation item; the item
    /// becomes both the edge target and its sole carried item.
    fn upsert_agent_interaction_edge_for_item(
        &mut self,
        pending: PendingAgentInteractionEdge,
        target_item_id: String,
    ) -> Result<()> {
        self.upsert_interaction_edge(InteractionEdge {
            edge_id: pending.edge_id,
            kind: pending.kind,
            source: pending.source,
            target: TraceAnchor::ConversationItem {
                item_id: target_item_id.clone(),
            },
            started_at_unix_ms: pending.started_at_unix_ms,
            ended_at_unix_ms: pending.ended_at_unix_ms,
            carried_item_ids: vec![target_item_id],
            carried_raw_payload_ids: pending.carried_raw_payload_ids,
        })
    }

    /// Inserts an interaction edge, or merges a re-observation of the same edge id.
    ///
    /// Re-observations must agree on kind/source/target; merging widens the time
    /// window and unions the carried item/payload id lists.
    fn upsert_interaction_edge(&mut self, edge: InteractionEdge) -> Result<()> {
        if let Some(existing) = self.rollout.interaction_edges.get_mut(&edge.edge_id) {
            if existing.kind != edge.kind
                || existing.source != edge.source
                || existing.target != edge.target
            {
                bail!(
                    "interaction edge {} was observed with conflicting endpoints",
                    edge.edge_id
                );
            }
            existing.started_at_unix_ms = existing.started_at_unix_ms.min(edge.started_at_unix_ms);
            existing.ended_at_unix_ms = match (existing.ended_at_unix_ms, edge.ended_at_unix_ms) {
                (Some(existing_ended), Some(edge_ended)) => Some(existing_ended.max(edge_ended)),
                (None, ended) | (ended, None) => ended,
            };
            extend_unique(&mut existing.carried_item_ids, edge.carried_item_ids);
            extend_unique(
                &mut existing.carried_raw_payload_ids,
                edge.carried_raw_payload_ids,
            );
            return Ok(());
        }

        self.rollout
            .interaction_edges
            .insert(edge.edge_id.clone(), edge);
        Ok(())
    }

    /// Finds the first mailbox item in `thread_id` with matching content that is
    /// not already claimed as the target of some interaction edge.
    fn find_unlinked_inter_agent_message_item(
        &self,
        thread_id: &str,
        message_content: &str,
    ) -> Option<String> {
        self.rollout
            .threads
            .get(thread_id)?
            .conversation_item_ids
            .iter()
            .find(|item_id| {
                !self.is_interaction_edge_target_item(item_id)
                    && self
                        .inter_agent_message_item(item_id)
                        .is_some_and(|(_, content)| content == message_content)
            })
            .cloned()
    }

    /// Returns `(thread_id, message_content)` when the item is an inter-agent
    /// mailbox delivery addressed to its own thread's agent path.
    fn inter_agent_message_item(&self, item_id: &str) -> Option<(String, String)> {
        let item = self.rollout.conversation_items.get(item_id)?;
        let (recipient_agent_path, message_content) = inter_agent_message_fields(item)?;
        let thread = self.rollout.threads.get(&item.thread_id)?;
        // A delivery only counts for the thread it was actually addressed to.
        if recipient_agent_path != thread.agent_path {
            return None;
        }
        Some((item.thread_id.clone(), message_content))
    }

    /// True when any interaction edge already targets this conversation item.
    /// Linear scan over all edges; acceptable for bundle-sized traces.
    fn is_interaction_edge_target_item(&self, item_id: &str) -> bool {
        self.rollout
            .interaction_edges
            .values()
            .any(|edge| matches!(&edge.target, TraceAnchor::ConversationItem { item_id: target } if target == item_id))
    }

    /// Newest assistant message (by first-seen time) for a specific thread/turn.
    fn latest_assistant_message_item_for_turn(
        &self,
        thread_id: &str,
        codex_turn_id: &str,
    ) -> Option<String> {
        self.rollout
            .conversation_items
            .values()
            .filter(|item| {
                item.thread_id == thread_id
                    && item.codex_turn_id.as_deref() == Some(codex_turn_id)
                    && item.role == ConversationRole::Assistant
                    && item.kind == ConversationItemKind::Message
            })
            .max_by_key(|item| item.first_seen_at_unix_ms)
            .map(|item| item.item_id.clone())
    }
}

/// Appends each new item that is not already present (order-preserving set union).
fn extend_unique<T: PartialEq>(items: &mut Vec<T>, new_items: Vec<T>) {
    for item in new_items {
        if !items.iter().any(|existing| existing == &item) {
            items.push(item);
        }
    }
}

/// Canonical edge id for the single interaction edge owned by a tool call.
fn tool_edge_id(tool_call_id: &str) -> String {
    format!("edge:tool:{tool_call_id}")
}

/// True when `anchor` is the ToolCall anchor for `tool_call_id`.
fn tool_call_source_matches(anchor: &TraceAnchor, tool_call_id: &str) -> bool {
    matches!(anchor, TraceAnchor::ToolCall { tool_call_id: source } if source == tool_call_id)
}

/// Pushes `item` only if an equal string is not already present.
fn push_unique(items: &mut Vec<String>, item: &str) {
    if !items.iter().any(|existing| existing == item) {
        items.push(item.to_string());
    }
}
/// Extracts `(recipient_agent_path, message_content)` when `item` is an
/// inter-agent mailbox delivery; `None` for every other item shape.
fn inter_agent_message_fields(item: &ConversationItem) -> Option<(String, String)> {
    // Multi-agent v2 injects mailbox deliveries as assistant messages whose
    // text is serialized `InterAgentCommunication`. Treat only that exact
    // transport shape as an edge target; ordinary assistant JSON must not be
    // mistaken for cross-thread delivery.
    if item.role != ConversationRole::Assistant || item.kind != ConversationItemKind::Message {
        return None;
    }
    // The transport shape is exactly one text part.
    let [ConversationPart::Text { text }] = item.body.parts.as_slice() else {
        return None;
    };
    let communication = serde_json::from_str::<InterAgentCommunication>(text).ok()?;
    Some((communication.recipient.to_string(), communication.content))
}

#[cfg(test)]
#[path = "agents_tests.rs"]
mod tests;
diff --git a/codex-rs/rollout-trace/src/reducer/tool/agents_tests.rs b/codex-rs/rollout-trace/src/reducer/tool/agents_tests.rs
new file mode 100644
index 000000000000..deaf06dac71d
--- /dev/null
+++ b/codex-rs/rollout-trace/src/reducer/tool/agents_tests.rs
@@ -0,0 +1,717 @@
use pretty_assertions::assert_eq;
use serde_json::json;
use tempfile::TempDir;

use crate::model::AgentOrigin;
use crate::model::ExecutionStatus;
use crate::model::InteractionEdgeKind;
use crate::model::RolloutStatus;
use crate::model::ToolCallKind;
use crate::model::ToolCallSummary;
use crate::model::TraceAnchor;
use crate::payload::RawPayloadKind;
use crate::payload::RawPayloadRef;
use crate::raw_event::RawToolCallRequester;
use crate::raw_event::RawTraceEventPayload;
use crate::reducer::test_support::append_completed_inference;
use crate::reducer::test_support::append_inference_request;
use crate::reducer::test_support::create_started_agent_writer;
use crate::reducer::test_support::message;
use crate::reducer::test_support::start_agent_turn;
use crate::reducer::test_support::start_thread;
use crate::reducer::test_support::start_turn_for_thread;
use crate::reducer::test_support::trace_context_for_agent;
use
crate::reducer::test_support::trace_context_for_thread; +use crate::replay_bundle; +use crate::writer::TraceWriter; + +#[test] +fn child_thread_metadata_creates_spawn_origin_without_delivery_edge() -> anyhow::Result<()> { + let temp = TempDir::new()?; + let writer = TraceWriter::create( + temp.path(), + "trace-1".to_string(), + "rollout-1".to_string(), + "019d0000-0000-7000-8000-000000000002".to_string(), + )?; + let metadata = writer.write_json_payload( + RawPayloadKind::SessionMetadata, + &json!({ + "nickname": "James", + "agent_role": "explorer", + "task_name": "repo_file_counter", + "model": "gpt-test", + "session_source": { + "subagent": { + "thread_spawn": { + "parent_thread_id": "019d0000-0000-7000-8000-000000000001", + "agent_path": "/root/repo_file_counter", + "agent_nickname": "James", + "agent_role": "explorer" + } + } + } + }), + )?; + writer.append(RawTraceEventPayload::ThreadStarted { + thread_id: "019d0000-0000-7000-8000-000000000002".to_string(), + agent_path: "/root/repo_file_counter".to_string(), + metadata_payload: Some(metadata), + })?; + + let replayed = replay_bundle(temp.path())?; + let thread = &replayed.threads["019d0000-0000-7000-8000-000000000002"]; + assert_eq!(thread.nickname, Some("James".to_string())); + assert_eq!(thread.default_model, Some("gpt-test".to_string())); + assert_eq!( + thread.origin, + AgentOrigin::Spawned { + parent_thread_id: "019d0000-0000-7000-8000-000000000001".to_string(), + spawn_edge_id: "edge:spawn:019d0000-0000-7000-8000-000000000001:019d0000-0000-7000-8000-000000000002".to_string(), + task_name: "repo_file_counter".to_string(), + agent_role: "explorer".to_string(), + } + ); + assert!( + !replayed.interaction_edges.contains_key( + "edge:spawn:019d0000-0000-7000-8000-000000000001:019d0000-0000-7000-8000-000000000002" + ), + "spawn metadata identifies the child, but the delivery edge waits for the recipient \ + conversation item" + ); + + Ok(()) +} + +#[test] +fn 
spawn_runtime_payload_targets_delivered_child_message() -> anyhow::Result<()> { + let temp = TempDir::new()?; + let writer = create_started_agent_writer(&temp)?; + start_agent_turn(&writer, "turn-1")?; + + let spawn_payloads = append_spawn_agent_tool_lifecycle(&writer, "turn-1")?; + + // Then record the child-side model-visible task message. This is the + // preferred target because it pinpoints where the delegated work entered + // the child timeline. + start_thread( + &writer, + "019d0000-0000-7000-8000-000000000002", + "/root/repo_file_counter", + )?; + start_turn_for_thread( + &writer, + "019d0000-0000-7000-8000-000000000002", + "turn-child-1", + )?; + let delivered = inter_agent_message( + "/root", + "/root/repo_file_counter", + "count", + /*trigger_turn*/ true, + ); + append_inference_request( + &writer, + "019d0000-0000-7000-8000-000000000002", + "turn-child-1", + "inference-child-1", + vec![message("assistant", &delivered)], + )?; + + let replayed = replay_bundle(temp.path())?; + let edge = &replayed.interaction_edges["edge:spawn:019d0000-0000-7000-8000-000000000001:019d0000-0000-7000-8000-000000000002"]; + assert_eq!(edge.kind, InteractionEdgeKind::SpawnAgent); + assert_eq!( + edge.source, + TraceAnchor::ToolCall { + tool_call_id: "call-spawn".to_string() + } + ); + let target_item_id = target_conversation_item_id(&edge.target); + assert_eq!(edge.carried_item_ids, vec![target_item_id.clone()]); + assert_eq!( + replayed.conversation_items[target_item_id].thread_id, + "019d0000-0000-7000-8000-000000000002" + ); + assert_eq!( + edge.carried_raw_payload_ids, + vec![ + spawn_payloads.invocation.raw_payload_id, + spawn_payloads.begin.raw_payload_id, + spawn_payloads.end.raw_payload_id, + spawn_payloads.result.raw_payload_id, + ] + ); + + Ok(()) +} + +#[test] +fn spawn_runtime_payload_falls_back_to_child_thread_without_delivery_item() -> anyhow::Result<()> { + let temp = TempDir::new()?; + let writer = create_started_agent_writer(&temp)?; + 
start_agent_turn(&writer, "turn-1")?; + let spawn_payloads = append_spawn_agent_tool_lifecycle(&writer, "turn-1")?; + + // Deliberately start the child thread without appending an inference + // request containing the inter-agent task message. This reproduces the + // failure path where the child aborts before the reducer can target the + // precise child-side ConversationItem. + start_thread( + &writer, + "019d0000-0000-7000-8000-000000000002", + "/root/repo_file_counter", + )?; + + let replayed = replay_bundle(temp.path())?; + let edge = &replayed.interaction_edges["edge:spawn:019d0000-0000-7000-8000-000000000001:019d0000-0000-7000-8000-000000000002"]; + assert_eq!(edge.kind, InteractionEdgeKind::SpawnAgent); + assert_eq!( + edge.source, + TraceAnchor::ToolCall { + tool_call_id: "call-spawn".to_string() + } + ); + assert_eq!( + edge.target, + TraceAnchor::Thread { + thread_id: "019d0000-0000-7000-8000-000000000002".to_string() + } + ); + // No transcript item carried the task, so the fallback edge should not + // claim one. The raw payloads still preserve the tool evidence. 
+ assert!(edge.carried_item_ids.is_empty()); + assert_eq!( + edge.carried_raw_payload_ids, + vec![ + spawn_payloads.invocation.raw_payload_id, + spawn_payloads.begin.raw_payload_id, + spawn_payloads.end.raw_payload_id, + spawn_payloads.result.raw_payload_id, + ] + ); + + Ok(()) +} + +#[test] +fn send_message_runtime_payload_targets_delivered_child_message() -> anyhow::Result<()> { + let temp = TempDir::new()?; + let writer = create_started_agent_writer(&temp)?; + start_agent_turn(&writer, "turn-1")?; + let invocation_payload = writer.write_json_payload( + RawPayloadKind::ToolInvocation, + &json!({ + "tool_name": "send_message", + "payload": { + "type": "function", + "arguments": "{\"target\":\"/root/child\",\"message\":\"hello\"}" + } + }), + )?; + writer.append_with_context( + trace_context_for_agent("turn-1"), + RawTraceEventPayload::ToolCallStarted { + tool_call_id: "call-send".to_string(), + model_visible_call_id: Some("call-send".to_string()), + code_mode_runtime_tool_id: None, + requester: RawToolCallRequester::Model, + kind: ToolCallKind::SendMessage, + summary: ToolCallSummary::Generic { + label: "send_message".to_string(), + input_preview: None, + output_preview: None, + }, + invocation_payload: Some(invocation_payload), + }, + )?; + let begin_payload = writer.write_json_payload( + RawPayloadKind::ToolRuntimeEvent, + &json!({ + "call_id": "call-send", + "sender_thread_id": "019d0000-0000-7000-8000-000000000001", + "receiver_thread_id": "019d0000-0000-7000-8000-000000000002", + "prompt": "hello", + "status": "running" + }), + )?; + writer.append_with_context( + trace_context_for_agent("turn-1"), + RawTraceEventPayload::ToolCallRuntimeStarted { + tool_call_id: "call-send".to_string(), + runtime_payload: begin_payload, + }, + )?; + let end_payload = writer.write_json_payload( + RawPayloadKind::ToolRuntimeEvent, + &json!({ + "call_id": "call-send", + "sender_thread_id": "019d0000-0000-7000-8000-000000000001", + "receiver_thread_id": 
"019d0000-0000-7000-8000-000000000002", + "prompt": "hello", + "status": "running" + }), + )?; + writer.append_with_context( + trace_context_for_agent("turn-1"), + RawTraceEventPayload::ToolCallRuntimeEnded { + tool_call_id: "call-send".to_string(), + status: ExecutionStatus::Completed, + runtime_payload: end_payload, + }, + )?; + start_thread( + &writer, + "019d0000-0000-7000-8000-000000000002", + "/root/child", + )?; + start_turn_for_thread( + &writer, + "019d0000-0000-7000-8000-000000000002", + "turn-child-1", + )?; + let delivered = + inter_agent_message("/root", "/root/child", "hello", /*trigger_turn*/ false); + append_inference_request( + &writer, + "019d0000-0000-7000-8000-000000000002", + "turn-child-1", + "inference-child-1", + vec![message("assistant", &delivered)], + )?; + + let replayed = replay_bundle(temp.path())?; + let edge = &replayed.interaction_edges["edge:tool:call-send"]; + assert_eq!(edge.kind, InteractionEdgeKind::SendMessage); + assert_eq!( + edge.source, + TraceAnchor::ToolCall { + tool_call_id: "call-send".to_string() + } + ); + let target_item_id = target_conversation_item_id(&edge.target); + assert_eq!(edge.carried_item_ids, vec![target_item_id.clone()]); + assert_eq!( + replayed.conversation_items[target_item_id].thread_id, + "019d0000-0000-7000-8000-000000000002" + ); + assert!(edge.ended_at_unix_ms.is_some()); + + Ok(()) +} + +#[test] +fn close_agent_runtime_payload_targets_thread() -> anyhow::Result<()> { + let temp = TempDir::new()?; + let writer = create_started_agent_writer(&temp)?; + start_thread( + &writer, + "019d0000-0000-7000-8000-000000000002", + "/root/child", + )?; + start_agent_turn(&writer, "turn-1")?; + let invocation_payload = writer.write_json_payload( + RawPayloadKind::ToolInvocation, + &json!({ + "tool_name": "close_agent", + "payload": { + "type": "function", + "arguments": r#"{"target":"/root/child"}"# + } + }), + )?; + writer.append_with_context( + trace_context_for_agent("turn-1"), + 
RawTraceEventPayload::ToolCallStarted { + tool_call_id: "call-close".to_string(), + model_visible_call_id: Some("call-close".to_string()), + code_mode_runtime_tool_id: None, + requester: RawToolCallRequester::Model, + kind: ToolCallKind::CloseAgent, + summary: ToolCallSummary::Generic { + label: "close_agent".to_string(), + input_preview: None, + output_preview: None, + }, + invocation_payload: Some(invocation_payload.clone()), + }, + )?; + let begin_payload = writer.write_json_payload( + RawPayloadKind::ToolRuntimeEvent, + &json!({ + "call_id": "call-close", + "sender_thread_id": "019d0000-0000-7000-8000-000000000001", + "receiver_thread_id": "019d0000-0000-7000-8000-000000000002" + }), + )?; + writer.append_with_context( + trace_context_for_agent("turn-1"), + RawTraceEventPayload::ToolCallRuntimeStarted { + tool_call_id: "call-close".to_string(), + runtime_payload: begin_payload.clone(), + }, + )?; + let end_payload = writer.write_json_payload( + RawPayloadKind::ToolRuntimeEvent, + &json!({ + "call_id": "call-close", + "sender_thread_id": "019d0000-0000-7000-8000-000000000001", + "receiver_thread_id": "019d0000-0000-7000-8000-000000000002", + "receiver_agent_nickname": "Scout", + "receiver_agent_role": "explorer", + "status": "running" + }), + )?; + writer.append_with_context( + trace_context_for_agent("turn-1"), + RawTraceEventPayload::ToolCallRuntimeEnded { + tool_call_id: "call-close".to_string(), + status: ExecutionStatus::Completed, + runtime_payload: end_payload.clone(), + }, + )?; + let result_payload = writer.write_json_payload( + RawPayloadKind::ToolResult, + &json!({"previous_status": "running"}), + )?; + writer.append_with_context( + trace_context_for_agent("turn-1"), + RawTraceEventPayload::ToolCallEnded { + tool_call_id: "call-close".to_string(), + status: ExecutionStatus::Completed, + result_payload: Some(result_payload.clone()), + }, + )?; + writer.append(RawTraceEventPayload::ThreadEnded { + thread_id: 
"019d0000-0000-7000-8000-000000000002".to_string(), + status: RolloutStatus::Completed, + })?; + + let replayed = replay_bundle(temp.path())?; + let edge = &replayed.interaction_edges["edge:tool:call-close"]; + assert_eq!(edge.kind, InteractionEdgeKind::CloseAgent); + assert_eq!( + edge.source, + TraceAnchor::ToolCall { + tool_call_id: "call-close".to_string() + } + ); + assert_eq!( + edge.target, + TraceAnchor::Thread { + thread_id: "019d0000-0000-7000-8000-000000000002".to_string() + } + ); + assert!(edge.carried_item_ids.is_empty()); + assert_eq!( + edge.carried_raw_payload_ids, + vec![ + invocation_payload.raw_payload_id, + begin_payload.raw_payload_id, + end_payload.raw_payload_id, + result_payload.raw_payload_id, + ] + ); + assert_eq!( + replayed.threads["019d0000-0000-7000-8000-000000000002"] + .execution + .status, + ExecutionStatus::Completed + ); + assert_eq!(replayed.status, RolloutStatus::Running); + + Ok(()) +} + +#[test] +fn agent_result_edge_links_child_result_to_parent_notification() -> anyhow::Result<()> { + let temp = TempDir::new()?; + let writer = create_started_agent_writer(&temp)?; + start_thread( + &writer, + "019d0000-0000-7000-8000-000000000002", + "/root/child", + )?; + start_turn_for_thread( + &writer, + "019d0000-0000-7000-8000-000000000002", + "turn-child-1", + )?; + append_completed_inference( + &writer, + "019d0000-0000-7000-8000-000000000002", + "turn-child-1", + "inference-child-1", + vec![message("assistant", "task")], + vec![message("assistant", "done")], + )?; + + let notification = "{\"agent_path\":\"/root/child\",\"status\":{\"completed\":\"done\"}}"; + let carried_payload = writer.write_json_payload( + RawPayloadKind::AgentResult, + &json!({ + "child_agent_path": "/root/child", + "message": notification, + "status": {"completed": "done"} + }), + )?; + writer.append_with_context( + trace_context_for_thread("019d0000-0000-7000-8000-000000000002", "turn-child-1"), + RawTraceEventPayload::AgentResultObserved { + edge_id: 
"edge:agent_result:thread-child:turn-child-1:thread-root".to_string(), + child_thread_id: "019d0000-0000-7000-8000-000000000002".to_string(), + child_codex_turn_id: "turn-child-1".to_string(), + parent_thread_id: "019d0000-0000-7000-8000-000000000001".to_string(), + message: notification.to_string(), + carried_payload: Some(carried_payload.clone()), + }, + )?; + + start_agent_turn(&writer, "turn-root-1")?; + let delivered = inter_agent_message( + "/root/child", + "/root", + notification, + /*trigger_turn*/ false, + ); + append_inference_request( + &writer, + "019d0000-0000-7000-8000-000000000001", + "turn-root-1", + "inference-root-1", + vec![message("assistant", &delivered)], + )?; + + let replayed = replay_bundle(temp.path())?; + let edge = + &replayed.interaction_edges["edge:agent_result:thread-child:turn-child-1:thread-root"]; + assert_eq!(edge.kind, InteractionEdgeKind::AgentResult); + let TraceAnchor::ConversationItem { + item_id: source_item_id, + } = &edge.source + else { + panic!("expected child result conversation item source"); + }; + assert_eq!( + text_body(&replayed.conversation_items[source_item_id]), + "done" + ); + let target_item_id = target_conversation_item_id(&edge.target); + assert_eq!( + replayed.conversation_items[target_item_id].thread_id, + "019d0000-0000-7000-8000-000000000001" + ); + assert_eq!(edge.carried_item_ids, vec![target_item_id.clone()]); + assert_eq!( + edge.carried_raw_payload_ids, + vec![carried_payload.raw_payload_id] + ); + + Ok(()) +} + +#[test] +fn agent_result_edge_falls_back_to_child_thread_without_result_message() -> anyhow::Result<()> { + let temp = TempDir::new()?; + let writer = create_started_agent_writer(&temp)?; + + // The child thread and turn exist, but there is intentionally no completed + // assistant message for this turn. Failed child tasks can still notify the + // parent through AgentStatus, so the result edge must not require a final + // transcript item from the child. 
+ start_thread( + &writer, + "019d0000-0000-7000-8000-000000000002", + "/root/child", + )?; + start_turn_for_thread( + &writer, + "019d0000-0000-7000-8000-000000000002", + "turn-child-1", + )?; + + let notification = r#"{"agent_path":"/root/child","status":{"failed":"boom"}}"#; + let carried_payload = writer.write_json_payload( + RawPayloadKind::AgentResult, + &json!({ + "child_agent_path": "/root/child", + "message": notification, + "status": {"failed": "boom"} + }), + )?; + writer.append_with_context( + trace_context_for_thread("019d0000-0000-7000-8000-000000000002", "turn-child-1"), + RawTraceEventPayload::AgentResultObserved { + edge_id: "edge:agent_result:thread-child:turn-child-1:thread-root".to_string(), + child_thread_id: "019d0000-0000-7000-8000-000000000002".to_string(), + child_codex_turn_id: "turn-child-1".to_string(), + parent_thread_id: "019d0000-0000-7000-8000-000000000001".to_string(), + message: notification.to_string(), + carried_payload: Some(carried_payload.clone()), + }, + )?; + + // The parent does receive the failure notification as a model-visible + // mailbox item. The target should remain that precise parent-side + // ConversationItem even though the source falls back to the child thread. 
+ start_agent_turn(&writer, "turn-root-1")?; + let delivered = inter_agent_message( + "/root/child", + "/root", + notification, + /*trigger_turn*/ false, + ); + append_inference_request( + &writer, + "019d0000-0000-7000-8000-000000000001", + "turn-root-1", + "inference-root-1", + vec![message("assistant", &delivered)], + )?; + + let replayed = replay_bundle(temp.path())?; + let edge = + &replayed.interaction_edges["edge:agent_result:thread-child:turn-child-1:thread-root"]; + assert_eq!(edge.kind, InteractionEdgeKind::AgentResult); + assert_eq!( + edge.source, + TraceAnchor::Thread { + thread_id: "019d0000-0000-7000-8000-000000000002".to_string(), + } + ); + let target_item_id = target_conversation_item_id(&edge.target); + assert_eq!( + replayed.conversation_items[target_item_id].thread_id, + "019d0000-0000-7000-8000-000000000001" + ); + assert_eq!(edge.carried_item_ids, vec![target_item_id.clone()]); + assert_eq!( + edge.carried_raw_payload_ids, + vec![carried_payload.raw_payload_id] + ); + + Ok(()) +} + +struct SpawnAgentToolPayloads { + invocation: RawPayloadRef, + begin: RawPayloadRef, + end: RawPayloadRef, + result: RawPayloadRef, +} + +fn append_spawn_agent_tool_lifecycle( + writer: &TraceWriter, + turn_id: &str, +) -> anyhow::Result { + // Keep the parent-side tool lifecycle in one place so the spawn tests can + // focus on the child-side event that decides the edge target. 
+ let invocation = writer.write_json_payload( + RawPayloadKind::ToolInvocation, + &json!({ + "tool_name": "spawn_agent", + "payload": { + "type": "function", + "arguments": r#"{"task_name":"repo_file_counter","message":"count"}"# + } + }), + )?; + writer.append_with_context( + trace_context_for_agent(turn_id), + RawTraceEventPayload::ToolCallStarted { + tool_call_id: "call-spawn".to_string(), + model_visible_call_id: Some("call-spawn".to_string()), + code_mode_runtime_tool_id: None, + requester: RawToolCallRequester::Model, + kind: ToolCallKind::SpawnAgent, + summary: ToolCallSummary::Generic { + label: "spawn_agent".to_string(), + input_preview: None, + output_preview: None, + }, + invocation_payload: Some(invocation.clone()), + }, + )?; + + let begin = writer.write_json_payload( + RawPayloadKind::ToolRuntimeEvent, + &json!({ + "call_id": "call-spawn", + "sender_thread_id": "019d0000-0000-7000-8000-000000000001", + "prompt": "count" + }), + )?; + writer.append_with_context( + trace_context_for_agent(turn_id), + RawTraceEventPayload::ToolCallRuntimeStarted { + tool_call_id: "call-spawn".to_string(), + runtime_payload: begin.clone(), + }, + )?; + + let end = writer.write_json_payload( + RawPayloadKind::ToolRuntimeEvent, + &json!({ + "call_id": "call-spawn", + "sender_thread_id": "019d0000-0000-7000-8000-000000000001", + "new_thread_id": "019d0000-0000-7000-8000-000000000002", + "prompt": "count", + "model": "gpt-test", + "reasoning_effort": "medium", + "status": "running" + }), + )?; + writer.append_with_context( + trace_context_for_agent(turn_id), + RawTraceEventPayload::ToolCallRuntimeEnded { + tool_call_id: "call-spawn".to_string(), + status: ExecutionStatus::Completed, + runtime_payload: end.clone(), + }, + )?; + + let result = writer.write_json_payload( + RawPayloadKind::ToolResult, + &json!({"task_name": "/root/repo_file_counter"}), + )?; + writer.append_with_context( + trace_context_for_agent(turn_id), + RawTraceEventPayload::ToolCallEnded { + tool_call_id: 
"call-spawn".to_string(), + status: ExecutionStatus::Completed, + result_payload: Some(result.clone()), + }, + )?; + + Ok(SpawnAgentToolPayloads { + invocation, + begin, + end, + result, + }) +} + +fn inter_agent_message(author: &str, recipient: &str, content: &str, trigger_turn: bool) -> String { + json!({ + "author": author, + "recipient": recipient, + "other_recipients": [], + "content": content, + "trigger_turn": trigger_turn, + }) + .to_string() +} + +fn target_conversation_item_id(anchor: &TraceAnchor) -> &String { + let TraceAnchor::ConversationItem { item_id } = anchor else { + panic!("expected conversation item target"); + }; + item_id +} + +fn text_body(item: &crate::model::ConversationItem) -> &str { + let [crate::model::ConversationPart::Text { text }] = item.body.parts.as_slice() else { + panic!("expected single text part"); + }; + text +} diff --git a/codex-rs/rollout-trace/src/reducer/tool/terminal.rs b/codex-rs/rollout-trace/src/reducer/tool/terminal.rs new file mode 100644 index 000000000000..cdd28ec5da1e --- /dev/null +++ b/codex-rs/rollout-trace/src/reducer/tool/terminal.rs @@ -0,0 +1,606 @@ +//! Terminal reduction for exec-like tool calls. +//! +//! The raw trace records terminal activity as normal tool lifecycle events. +//! Protocol-backed exec events carry `ExecCommand*` payloads with the richest +//! runtime details. Direct tools without protocol observations, such as +//! `write_stdin`, can still form a terminal row from the canonical dispatch +//! invocation/result payloads when those payloads carry the session join key. 

use anyhow::Context;
use anyhow::Result;
use anyhow::bail;
use serde::Deserialize;
use serde_json::Value as JsonValue;

use super::push_unique;
use crate::model::ExecutionStatus;
use crate::model::ExecutionWindow;
use crate::model::TerminalModelObservation;
use crate::model::TerminalObservationSource;
use crate::model::TerminalOperation;
use crate::model::TerminalOperationId;
use crate::model::TerminalOperationKind;
use crate::model::TerminalRequest;
use crate::model::TerminalResult;
use crate::model::TerminalSession;
use crate::model::ToolCallKind;
use crate::payload::RawPayloadRef;
use crate::raw_event::RawEventSeq;
use crate::reducer::TraceReducer;

impl TraceReducer {
    /// Starts a terminal operation from a canonical dispatch invocation payload.
    ///
    /// This is currently needed for direct tools such as write-stdin that do not
    /// emit a richer protocol runtime-begin event with the terminal join key.
    ///
    /// Returns the new operation id, or `None` when no terminal row is created
    /// (non-write-stdin tools, or a missing invocation payload).
    pub(in crate::reducer) fn start_terminal_operation_from_invocation(
        &mut self,
        seq: RawEventSeq,
        wall_time_unix_ms: i64,
        thread_id: &str,
        tool_call_id: &str,
        kind: &ToolCallKind,
        invocation_payload: Option<&RawPayloadRef>,
    ) -> Result<Option<TerminalOperationId>> {
        // Only write-stdin goes through this path today; all other kinds are
        // handled (or skipped) by the runtime-begin variant below.
        if !matches!(kind, ToolCallKind::WriteStdin) {
            return Ok(None);
        }
        let operation_kind = TerminalOperationKind::WriteStdin;
        let Some(invocation_payload) = invocation_payload else {
            // Payload writes are best-effort in the live recorder. If the
            // canonical invocation is missing, keep the ToolCall but avoid
            // fabricating a lossy terminal row.
            return Ok(None);
        };

        let payload = self.read_payload_json(invocation_payload)?;
        let request = parse_dispatch_terminal_request(payload).with_context(|| {
            format!(
                "parse terminal invocation payload {} as dispatch payload",
                invocation_payload.raw_payload_id
            )
        })?;
        self.insert_terminal_operation(TerminalOperationStart {
            seq,
            wall_time_unix_ms,
            thread_id,
            tool_call_id,
            operation_kind,
            raw_payload: invocation_payload,
            request,
        })
    }

    /// Starts a terminal operation from a protocol runtime-begin payload.
    ///
    /// Returns `None` for tool kinds that are not terminal operations; bails
    /// when the payload does not parse as an `ExecCommandBeginPayload`.
    pub(in crate::reducer) fn start_terminal_operation_from_runtime(
        &mut self,
        seq: RawEventSeq,
        wall_time_unix_ms: i64,
        thread_id: &str,
        tool_call_id: &str,
        kind: &ToolCallKind,
        runtime_payload: &RawPayloadRef,
    ) -> Result<Option<TerminalOperationId>> {
        let Some(operation_kind) = terminal_operation_kind(kind) else {
            return Ok(None);
        };

        let payload = self.read_payload_json(runtime_payload)?;
        let payload: ExecCommandBeginPayload =
            serde_json::from_value(payload).with_context(|| {
                format!(
                    "parse terminal runtime start payload {}",
                    runtime_payload.raw_payload_id
                )
            })?;
        let request = parse_protocol_terminal_request(payload, &operation_kind);
        self.insert_terminal_operation(TerminalOperationStart {
            seq,
            wall_time_unix_ms,
            thread_id,
            tool_call_id,
            operation_kind,
            raw_payload: runtime_payload,
            request,
        })
    }

    /// Allocates an operation id, records the operation as running, and joins it
    /// to its terminal session when the request already carried the join key.
    fn insert_terminal_operation(
        &mut self,
        start: TerminalOperationStart<'_>,
    ) -> Result<Option<TerminalOperationId>> {
        let operation_id = self.next_terminal_operation_id();
        let ParsedTerminalRequest {
            terminal_id,
            request,
        } = start.request;

        self.rollout.terminal_operations.insert(
            operation_id.clone(),
            TerminalOperation {
                operation_id: operation_id.clone(),
                terminal_id: terminal_id.clone(),
                tool_call_id: start.tool_call_id.to_string(),
                kind: start.operation_kind,
                execution: ExecutionWindow {
                    started_at_unix_ms: start.wall_time_unix_ms,
                    started_seq: start.seq,
                    ended_at_unix_ms: None,
ended_seq: None, + status: ExecutionStatus::Running, + }, + request, + result: None, + model_observations: Vec::new(), + raw_payload_ids: vec![start.raw_payload.raw_payload_id.clone()], + }, + ); + + if let Some(terminal_id) = terminal_id { + self.ensure_terminal_session( + start.thread_id, + &terminal_id, + &operation_id, + start.wall_time_unix_ms, + start.seq, + )?; + } + + Ok(Some(operation_id)) + } + + /// Completes the terminal operation associated with a tool call, if one exists. + /// + /// Non-terminal tools flow through the same generic tool lifecycle, so callers + /// may invoke this unconditionally and receive Ok for unrelated tool kinds. + pub(in crate::reducer) fn end_terminal_operation( + &mut self, + seq: RawEventSeq, + wall_time_unix_ms: i64, + thread_id: &str, + operation_id: &str, + status: ExecutionStatus, + response_payload: Option<&RawPayloadRef>, + ) -> Result<()> { + let Some(operation_kind) = self + .rollout + .terminal_operations + .get(operation_id) + .map(|operation| operation.kind.clone()) + else { + bail!("terminal end referenced unknown operation {operation_id}"); + }; + let response = response_payload + .map(|payload| { + let value = self.read_payload_json(payload)?; + let response = parse_terminal_response_payload( + value, + &operation_kind, + &payload.raw_payload_id, + )?; + Ok::<_, anyhow::Error>((payload.raw_payload_id.clone(), response)) + }) + .transpose()?; + + let (terminal_id, started_at_unix_ms, started_seq) = { + let Some(operation) = self.rollout.terminal_operations.get_mut(operation_id) else { + bail!("terminal end referenced unknown operation {operation_id}"); + }; + operation.execution.ended_at_unix_ms = Some(wall_time_unix_ms); + operation.execution.ended_seq = Some(seq); + operation.execution.status = status; + + if let Some((raw_payload_id, response)) = response { + push_unique(&mut operation.raw_payload_ids, &raw_payload_id); + // If begin and end both report a process id they must name the + // same terminal. 
If begin omitted it, the end event completes + // the session join key for this operation. + match (&operation.terminal_id, response.terminal_id.as_deref()) { + (Some(existing), Some(process_id)) if existing != process_id => { + bail!( + "terminal operation {operation_id} changed process id from \ + {existing} to {process_id}" + ); + } + (None, Some(process_id)) => { + operation.terminal_id = Some(process_id.to_string()); + } + (Some(_), Some(_)) | (Some(_), None) | (None, None) => {} + } + operation.result = Some(response.result); + } + + ( + operation.terminal_id.clone(), + operation.execution.started_at_unix_ms, + operation.execution.started_seq, + ) + }; + + if let Some(terminal_id) = terminal_id { + self.ensure_terminal_session( + thread_id, + &terminal_id, + operation_id, + started_at_unix_ms, + started_seq, + )?; + } + + Ok(()) + } + + fn ensure_terminal_session( + &mut self, + thread_id: &str, + terminal_id: &str, + operation_id: &str, + started_at_unix_ms: i64, + started_seq: RawEventSeq, + ) -> Result<()> { + if !self.rollout.terminal_sessions.contains_key(terminal_id) { + self.rollout.terminal_sessions.insert( + terminal_id.to_string(), + TerminalSession { + terminal_id: terminal_id.to_string(), + thread_id: thread_id.to_string(), + created_by_operation_id: operation_id.to_string(), + operation_ids: Vec::new(), + execution: ExecutionWindow { + started_at_unix_ms, + started_seq, + // Current raw events do not report a terminal/session + // shutdown boundary, so the session remains open even + // after individual operations complete. 
+ ended_at_unix_ms: None, + ended_seq: None, + status: ExecutionStatus::Running, + }, + }, + ); + } + + let Some(session) = self.rollout.terminal_sessions.get_mut(terminal_id) else { + bail!("terminal session {terminal_id} disappeared during reduction"); + }; + if session.thread_id != thread_id { + bail!( + "terminal session {terminal_id} belongs to thread {}, not {thread_id}", + session.thread_id + ); + } + push_unique(&mut session.operation_ids, operation_id); + Ok(()) + } + + /// Mirrors model-visible tool items onto the terminal observation view. + /// + /// Runtime terminal rows are useful on their own, but the model-visible call + /// and output item ids let viewers jump between transcript and terminal timelines. + pub(in crate::reducer) fn sync_terminal_model_observation( + &mut self, + tool_call_id: &str, + ) -> Result<()> { + let Some(tool_call) = self.rollout.tool_calls.get(tool_call_id) else { + bail!("tool call {tool_call_id} disappeared during terminal observation linking"); + }; + let Some(operation_id) = tool_call.terminal_operation_id.clone() else { + return Ok(()); + }; + let call_item_ids = tool_call.model_visible_call_item_ids.clone(); + let output_item_ids = tool_call.model_visible_output_item_ids.clone(); + if call_item_ids.is_empty() && output_item_ids.is_empty() { + return Ok(()); + } + + let Some(operation) = self.rollout.terminal_operations.get_mut(&operation_id) else { + bail!("terminal operation {operation_id} disappeared during observation linking"); + }; + // A terminal result and a model-visible tool output are intentionally + // separate: the former is what the runtime saw, the latter is what later + // inference payloads prove was shown back to the model. 
+        if let Some(observation) = operation
+            .model_observations
+            .iter_mut()
+            .find(|observation| observation.source == TerminalObservationSource::DirectToolCall)
+        {
+            observation.call_item_ids = call_item_ids;
+            observation.output_item_ids = output_item_ids;
+        } else {
+            operation.model_observations.push(TerminalModelObservation {
+                call_item_ids,
+                output_item_ids,
+                source: TerminalObservationSource::DirectToolCall,
+            });
+        }
+        Ok(())
+    }
+
+    fn next_terminal_operation_id(&mut self) -> TerminalOperationId {
+        let ordinal = self.next_terminal_operation_ordinal;
+        self.next_terminal_operation_ordinal += 1;
+        format!("terminal_operation:{ordinal}")
+    }
+}
+
+fn terminal_operation_kind(kind: &ToolCallKind) -> Option<TerminalOperationKind> {
+    match kind {
+        ToolCallKind::ExecCommand => Some(TerminalOperationKind::ExecCommand),
+        ToolCallKind::WriteStdin => Some(TerminalOperationKind::WriteStdin),
+        ToolCallKind::ApplyPatch
+        | ToolCallKind::Mcp { .. }
+        | ToolCallKind::Web
+        | ToolCallKind::ImageGeneration
+        | ToolCallKind::SpawnAgent
+        | ToolCallKind::AssignAgentTask
+        | ToolCallKind::SendMessage
+        | ToolCallKind::WaitAgent
+        | ToolCallKind::CloseAgent
+        | ToolCallKind::Other { .. } => None,
+    }
+}
+
+struct TerminalOperationStart<'a> {
+    seq: RawEventSeq,
+    wall_time_unix_ms: i64,
+    thread_id: &'a str,
+    tool_call_id: &'a str,
+    operation_kind: TerminalOperationKind,
+    raw_payload: &'a RawPayloadRef,
+    request: ParsedTerminalRequest,
+}
+
+struct ParsedTerminalRequest {
+    terminal_id: Option<String>,
+    request: TerminalRequest,
+}
+
+struct ParsedTerminalResponse {
+    terminal_id: Option<String>,
+    result: TerminalResult,
+}
+
+fn parse_protocol_terminal_request(
+    payload: ExecCommandBeginPayload,
+    operation_kind: &TerminalOperationKind,
+) -> ParsedTerminalRequest {
+    // Startup/poll paths usually include a process id at begin time, but plain
+    // exec starts may only learn it in the matching end event.
+    let terminal_id = payload.process_id.clone();
+    let request = match operation_kind {
+        TerminalOperationKind::ExecCommand => TerminalRequest::ExecCommand {
+            display_command: payload.command.join(" "),
+            command: payload.command,
+            cwd: payload.cwd,
+            yield_time_ms: None,
+            max_output_tokens: None,
+        },
+        TerminalOperationKind::WriteStdin => TerminalRequest::WriteStdin {
+            stdin: payload.interaction_input.unwrap_or_default(),
+            yield_time_ms: None,
+            max_output_tokens: None,
+        },
+    };
+    ParsedTerminalRequest {
+        terminal_id,
+        request,
+    }
+}
+
+fn parse_dispatch_terminal_request(value: JsonValue) -> Result<ParsedTerminalRequest> {
+    let payload: DispatchedToolTraceRequestPayload = serde_json::from_value(value)?;
+    if payload.tool_name != "write_stdin" {
+        bail!(
+            "dispatch terminal request is for {}, not write_stdin",
+            payload.tool_name
+        );
+    }
+    if payload.payload.kind != "function" {
+        bail!(
+            "write_stdin dispatch payload used unsupported {} payload",
+            payload.payload.kind
+        );
+    }
+    let arguments = payload
+        .payload
+        .arguments
+        .context("write_stdin dispatch payload omitted function arguments")?;
+    let args: DispatchedWriteStdinArgs = serde_json::from_str(&arguments)
+        .context("parse write_stdin dispatch function arguments")?;
+    let terminal_id = terminal_id_from_json(&args.session_id)
+        .context("write_stdin dispatch payload omitted session_id")?;
+
+    Ok(ParsedTerminalRequest {
+        terminal_id: Some(terminal_id),
+        request: TerminalRequest::WriteStdin {
+            stdin: args.chars,
+            yield_time_ms: args.yield_time_ms,
+            max_output_tokens: args.max_output_tokens,
+        },
+    })
+}
+
+fn parse_terminal_response_payload(
+    value: JsonValue,
+    operation_kind: &TerminalOperationKind,
+    raw_payload_id: &str,
+) -> Result<ParsedTerminalResponse> {
+    match operation_kind {
+        TerminalOperationKind::ExecCommand => {
+            let payload = serde_json::from_value::<ExecCommandEndPayload>(value)
+                .with_context(|| format!("parse exec terminal response {raw_payload_id}"))?;
+            Ok(parse_protocol_terminal_response(payload))
+        }
+        TerminalOperationKind::WriteStdin => {
+            match serde_json::from_value::<ExecCommandEndPayload>(value.clone()) {
+                Ok(payload) => Ok(parse_protocol_terminal_response(payload)),
+                Err(protocol_err) => parse_dispatch_terminal_response(value).with_context(|| {
+                    format!(
+                        "parse write_stdin terminal response {raw_payload_id} as protocol payload \
+                         ({protocol_err}) or dispatch payload"
+                    )
+                }),
+            }
+        }
+    }
+}
+
+fn parse_protocol_terminal_response(payload: ExecCommandEndPayload) -> ParsedTerminalResponse {
+    ParsedTerminalResponse {
+        terminal_id: payload.process_id,
+        result: TerminalResult {
+            exit_code: Some(payload.exit_code),
+            stdout: payload.stdout,
+            stderr: payload.stderr,
+            formatted_output: Some(payload.formatted_output),
+            original_token_count: None,
+            chunk_id: None,
+        },
+    }
+}
+
+fn parse_dispatch_terminal_response(value: JsonValue) -> Result<ParsedTerminalResponse> {
+    let payload: DispatchedToolTraceResponsePayload = serde_json::from_value(value)?;
+    let result = match payload {
+        DispatchedToolTraceResponsePayload::DirectResponse { response_item } => {
+            let output = response_item
+                .get("output")
+                .and_then(json_text_content)
+                .unwrap_or_else(|| response_item.to_string());
+            TerminalResult {
+                exit_code: None,
+                stdout: output.clone(),
+                stderr: String::new(),
+                formatted_output: Some(output),
+                original_token_count: None,
+                chunk_id: None,
+            }
+        }
+        DispatchedToolTraceResponsePayload::CodeModeResponse { value } => {
+            // Code-mode returns the JavaScript-facing tool value, not the text
+            // shown to the model. For write_stdin that value is the structured
+            // unified-exec result, so keep ToolCall.raw_result_payload_id as the
+            // raw boundary while projecting terminal-specific fields here.
+            parse_code_mode_exec_result(value)
+        }
+        DispatchedToolTraceResponsePayload::Error { error } => TerminalResult {
+            exit_code: None,
+            stdout: String::new(),
+            stderr: error.clone(),
+            formatted_output: Some(error),
+            original_token_count: None,
+            chunk_id: None,
+        },
+    };
+    Ok(ParsedTerminalResponse {
+        terminal_id: None,
+        result,
+    })
+}
+
+fn parse_code_mode_exec_result(value: JsonValue) -> TerminalResult {
+    match serde_json::from_value::<CodeModeExecResult>(value.clone()) {
+        Ok(result) => TerminalResult {
+            exit_code: result.exit_code,
+            stdout: result.output.clone(),
+            stderr: String::new(),
+            formatted_output: Some(result.output),
+            original_token_count: result.original_token_count,
+            chunk_id: result.chunk_id,
+        },
+        Err(_) => {
+            let output = json_text_content(&value).unwrap_or_else(|| value.to_string());
+            TerminalResult {
+                exit_code: None,
+                stdout: output.clone(),
+                stderr: String::new(),
+                formatted_output: Some(output),
+                original_token_count: None,
+                chunk_id: None,
+            }
+        }
+    }
+}
+
+fn json_text_content(value: &JsonValue) -> Option<String> {
+    match value {
+        JsonValue::String(text) => Some(text.clone()),
+        JsonValue::Array(items) => {
+            let text = items
+                .iter()
+                .filter_map(|item| item.get("text").and_then(JsonValue::as_str))
+                .collect::<Vec<_>>()
+                .join("\n");
+            (!text.is_empty()).then_some(text)
+        }
+        JsonValue::Null => None,
+        other => Some(other.to_string()),
+    }
+}
+
+fn terminal_id_from_json(value: &JsonValue) -> Option<String> {
+    match value {
+        JsonValue::String(value) if !value.is_empty() => Some(value.clone()),
+        JsonValue::Number(value) => Some(value.to_string()),
+        _ => None,
+    }
+}
+
+#[derive(Deserialize)]
+struct ExecCommandBeginPayload {
+    process_id: Option<String>,
+    command: Vec<String>,
+    cwd: String,
+    interaction_input: Option<String>,
+}
+
+#[derive(Deserialize)]
+struct ExecCommandEndPayload {
+    process_id: Option<String>,
+    stdout: String,
+    stderr: String,
+    exit_code: i32,
+    formatted_output: String,
+}
+
+#[derive(Deserialize)]
+struct DispatchedToolTraceRequestPayload {
+    tool_name:
String, + payload: DispatchedToolPayload, +} + +#[derive(Deserialize)] +struct DispatchedToolPayload { + #[serde(rename = "type")] + kind: String, + arguments: Option, +} + +#[derive(Deserialize)] +struct DispatchedWriteStdinArgs { + session_id: JsonValue, + #[serde(default)] + chars: String, + yield_time_ms: Option, + max_output_tokens: Option, +} + +#[derive(Deserialize)] +#[serde(rename_all = "snake_case", tag = "type")] +enum DispatchedToolTraceResponsePayload { + DirectResponse { response_item: JsonValue }, + CodeModeResponse { value: JsonValue }, + Error { error: String }, +} + +#[derive(Deserialize)] +struct CodeModeExecResult { + chunk_id: Option, + exit_code: Option, + original_token_count: Option, + output: String, +} + +#[cfg(test)] +#[path = "terminal_tests.rs"] +mod tests; diff --git a/codex-rs/rollout-trace/src/reducer/tool/terminal_tests.rs b/codex-rs/rollout-trace/src/reducer/tool/terminal_tests.rs new file mode 100644 index 000000000000..ddf76fe5a9d8 --- /dev/null +++ b/codex-rs/rollout-trace/src/reducer/tool/terminal_tests.rs @@ -0,0 +1,580 @@ +use pretty_assertions::assert_eq; +use serde_json::json; +use tempfile::TempDir; + +use crate::model::ExecutionStatus; +use crate::model::ExecutionWindow; +use crate::model::TerminalModelObservation; +use crate::model::TerminalObservationSource; +use crate::model::TerminalOperation; +use crate::model::TerminalOperationKind; +use crate::model::TerminalRequest; +use crate::model::TerminalResult; +use crate::model::TerminalSession; +use crate::model::ToolCallKind; +use crate::model::ToolCallSummary; +use crate::payload::RawPayloadKind; +use crate::raw_event::RawTraceEventPayload; +use crate::reducer::test_support::create_started_writer; +use crate::reducer::test_support::generic_summary; +use crate::reducer::test_support::message; +use crate::reducer::test_support::start_turn; +use crate::reducer::test_support::trace_context; +use crate::replay_bundle; +use crate::writer::TraceWriter; + +#[test] +fn 
exec_tool_reduces_to_terminal_operation_and_session() -> anyhow::Result<()> { + let temp = TempDir::new()?; + let writer = create_started_writer(&temp)?; + start_turn(&writer, "turn-1")?; + append_inference_with_tool_call(&writer)?; + + let invocation_payload = writer.write_json_payload( + RawPayloadKind::ToolInvocation, + &json!({ + "tool_name": "exec_command", + "tool_namespace": null, + "payload": { + "type": "function", + "arguments": "{\"cmd\":\"cargo test\"}" + } + }), + )?; + let invocation_payload_id = invocation_payload.raw_payload_id.clone(); + let _tool_start = writer.append_with_context( + trace_context("turn-1"), + RawTraceEventPayload::ToolCallStarted { + tool_call_id: "tool-1".to_string(), + model_visible_call_id: Some("call-1".to_string()), + code_mode_runtime_tool_id: None, + requester: crate::raw_event::RawToolCallRequester::Model, + kind: ToolCallKind::ExecCommand, + summary: generic_summary("exec_command"), + invocation_payload: Some(invocation_payload), + }, + )?; + + let runtime_start_payload = writer.write_json_payload( + RawPayloadKind::ToolRuntimeEvent, + &json!({ + "call_id": "tool-1", + "turn_id": "turn-1", + "command": ["cargo", "test"], + "cwd": "/repo" + }), + )?; + let runtime_start_payload_id = runtime_start_payload.raw_payload_id.clone(); + let runtime_start = writer.append_with_context( + trace_context("turn-1"), + RawTraceEventPayload::ToolCallRuntimeStarted { + tool_call_id: "tool-1".to_string(), + runtime_payload: runtime_start_payload, + }, + )?; + + let runtime_end_payload = writer.write_json_payload( + RawPayloadKind::ToolRuntimeEvent, + &json!({ + "call_id": "tool-1", + "process_id": "pty-1", + "turn_id": "turn-1", + "command": ["cargo", "test"], + "cwd": "/repo", + "stdout": "ok\n", + "stderr": "", + "exit_code": 0, + "formatted_output": "ok\n", + "status": "completed" + }), + )?; + let runtime_end_payload_id = runtime_end_payload.raw_payload_id.clone(); + let runtime_end = writer.append_with_context( + 
trace_context("turn-1"), + RawTraceEventPayload::ToolCallRuntimeEnded { + tool_call_id: "tool-1".to_string(), + status: ExecutionStatus::Completed, + runtime_payload: runtime_end_payload, + }, + )?; + + let result_payload = writer.write_json_payload( + RawPayloadKind::ToolResult, + &json!({ + "type": "direct_response", + "response_item": { + "type": "function_call_output", + "call_id": "call-1", + "output": "ok\n" + } + }), + )?; + let result_payload_id = result_payload.raw_payload_id.clone(); + writer.append_with_context( + trace_context("turn-1"), + RawTraceEventPayload::ToolCallEnded { + tool_call_id: "tool-1".to_string(), + status: ExecutionStatus::Completed, + result_payload: Some(result_payload), + }, + )?; + + start_turn(&writer, "turn-2")?; + append_followup_with_tool_output(&writer)?; + + let rollout = replay_bundle(temp.path())?; + let operation_id = "terminal_operation:1".to_string(); + let output_item_id = rollout.inference_calls["inference-2"] + .request_item_ids + .last() + .expect("tool output item") + .clone(); + + assert_eq!( + rollout.tool_calls["tool-1"].terminal_operation_id, + Some(operation_id.clone()), + ); + assert_eq!( + rollout.tool_calls["tool-1"].raw_invocation_payload_id, + Some(invocation_payload_id), + ); + assert_eq!( + rollout.tool_calls["tool-1"].raw_result_payload_id, + Some(result_payload_id), + ); + assert_eq!( + rollout.tool_calls["tool-1"].raw_runtime_payload_ids, + vec![ + runtime_start_payload_id.clone(), + runtime_end_payload_id.clone() + ], + ); + assert_eq!( + rollout.tool_calls["tool-1"].summary, + ToolCallSummary::Terminal { + operation_id: operation_id.clone(), + }, + ); + assert_eq!( + rollout.terminal_operations[&operation_id], + TerminalOperation { + operation_id: operation_id.clone(), + terminal_id: Some("pty-1".to_string()), + tool_call_id: "tool-1".to_string(), + kind: TerminalOperationKind::ExecCommand, + execution: ExecutionWindow { + started_at_unix_ms: runtime_start.wall_time_unix_ms, + started_seq: 
runtime_start.seq, + ended_at_unix_ms: Some(runtime_end.wall_time_unix_ms), + ended_seq: Some(runtime_end.seq), + status: ExecutionStatus::Completed, + }, + request: TerminalRequest::ExecCommand { + command: vec!["cargo".to_string(), "test".to_string()], + display_command: "cargo test".to_string(), + cwd: "/repo".to_string(), + yield_time_ms: None, + max_output_tokens: None, + }, + result: Some(TerminalResult { + exit_code: Some(0), + stdout: "ok\n".to_string(), + stderr: String::new(), + formatted_output: Some("ok\n".to_string()), + original_token_count: None, + chunk_id: None, + }), + model_observations: vec![TerminalModelObservation { + call_item_ids: rollout.inference_calls["inference-1"] + .response_item_ids + .clone(), + output_item_ids: vec![output_item_id], + source: TerminalObservationSource::DirectToolCall, + }], + raw_payload_ids: vec![runtime_start_payload_id, runtime_end_payload_id], + }, + ); + assert_eq!( + rollout.terminal_sessions["pty-1"], + TerminalSession { + terminal_id: "pty-1".to_string(), + thread_id: "thread-root".to_string(), + created_by_operation_id: operation_id.clone(), + operation_ids: vec![operation_id], + execution: ExecutionWindow { + started_at_unix_ms: runtime_start.wall_time_unix_ms, + started_seq: runtime_start.seq, + ended_at_unix_ms: None, + ended_seq: None, + status: ExecutionStatus::Running, + }, + }, + ); + + Ok(()) +} + +#[test] +fn write_stdin_operation_reuses_existing_terminal_session() -> anyhow::Result<()> { + let temp = TempDir::new()?; + let writer = create_started_writer(&temp)?; + start_turn(&writer, "turn-1")?; + + let startup_payload = writer.write_json_payload( + RawPayloadKind::ToolRuntimeEvent, + &json!({ + "call_id": "tool-start", + "process_id": "pty-1", + "turn_id": "turn-1", + "command": ["bash"], + "cwd": "/repo" + }), + )?; + writer.append_with_context( + trace_context("turn-1"), + RawTraceEventPayload::ToolCallStarted { + tool_call_id: "tool-start".to_string(), + model_visible_call_id: None, + 
code_mode_runtime_tool_id: None, + requester: crate::raw_event::RawToolCallRequester::Model, + kind: ToolCallKind::ExecCommand, + summary: generic_summary("exec_command"), + invocation_payload: None, + }, + )?; + writer.append_with_context( + trace_context("turn-1"), + RawTraceEventPayload::ToolCallRuntimeStarted { + tool_call_id: "tool-start".to_string(), + runtime_payload: startup_payload, + }, + )?; + + let stdin_payload = writer.write_json_payload( + RawPayloadKind::ToolRuntimeEvent, + &json!({ + "call_id": "tool-stdin", + "process_id": "pty-1", + "turn_id": "turn-1", + "command": ["bash"], + "cwd": "/repo", + "interaction_input": "echo hi\n" + }), + )?; + let _stdin_start = writer.append_with_context( + trace_context("turn-1"), + RawTraceEventPayload::ToolCallStarted { + tool_call_id: "tool-stdin".to_string(), + model_visible_call_id: None, + code_mode_runtime_tool_id: None, + requester: crate::raw_event::RawToolCallRequester::Model, + kind: ToolCallKind::WriteStdin, + summary: generic_summary("write_stdin"), + invocation_payload: None, + }, + )?; + let stdin_runtime_start = writer.append_with_context( + trace_context("turn-1"), + RawTraceEventPayload::ToolCallRuntimeStarted { + tool_call_id: "tool-stdin".to_string(), + runtime_payload: stdin_payload, + }, + )?; + + let rollout = replay_bundle(temp.path())?; + let startup_operation_id = "terminal_operation:1".to_string(); + let stdin_operation_id = "terminal_operation:2".to_string(); + + assert_eq!( + rollout.terminal_sessions["pty-1"].operation_ids, + vec![startup_operation_id, stdin_operation_id.clone()], + ); + assert_eq!( + rollout.terminal_operations[&stdin_operation_id], + TerminalOperation { + operation_id: stdin_operation_id.clone(), + terminal_id: Some("pty-1".to_string()), + tool_call_id: "tool-stdin".to_string(), + kind: TerminalOperationKind::WriteStdin, + execution: ExecutionWindow { + started_at_unix_ms: stdin_runtime_start.wall_time_unix_ms, + started_seq: stdin_runtime_start.seq, + 
ended_at_unix_ms: None, + ended_seq: None, + status: ExecutionStatus::Running, + }, + request: TerminalRequest::WriteStdin { + stdin: "echo hi\n".to_string(), + yield_time_ms: None, + max_output_tokens: None, + }, + result: None, + model_observations: Vec::new(), + raw_payload_ids: vec!["raw_payload:2".to_string()], + }, + ); + + Ok(()) +} + +#[test] +fn dispatch_write_stdin_payload_reduces_to_terminal_operation() -> anyhow::Result<()> { + let temp = TempDir::new()?; + let writer = create_started_writer(&temp)?; + start_turn(&writer, "turn-1")?; + + let request_payload = writer.write_json_payload( + RawPayloadKind::ToolInvocation, + &json!({ + "tool_name": "write_stdin", + "tool_namespace": null, + "payload": { + "type": "function", + "arguments": json!({ + "session_id": 123, + "chars": "echo hi\n", + "yield_time_ms": 250, + "max_output_tokens": 2000 + }).to_string() + } + }), + )?; + let request_payload_id = request_payload.raw_payload_id.clone(); + let tool_start = writer.append_with_context( + trace_context("turn-1"), + RawTraceEventPayload::ToolCallStarted { + tool_call_id: "tool-stdin".to_string(), + model_visible_call_id: Some("call-stdin".to_string()), + code_mode_runtime_tool_id: None, + requester: crate::raw_event::RawToolCallRequester::Model, + kind: ToolCallKind::WriteStdin, + summary: generic_summary("write_stdin"), + invocation_payload: Some(request_payload), + }, + )?; + + let response_payload = writer.write_json_payload( + RawPayloadKind::ToolResult, + &json!({ + "type": "direct_response", + "response_item": { + "type": "function_call_output", + "call_id": "call-stdin", + "output": "hi\n" + } + }), + )?; + let response_payload_id = response_payload.raw_payload_id.clone(); + let tool_end = writer.append_with_context( + trace_context("turn-1"), + RawTraceEventPayload::ToolCallEnded { + tool_call_id: "tool-stdin".to_string(), + status: ExecutionStatus::Completed, + result_payload: Some(response_payload), + }, + )?; + + let rollout = 
replay_bundle(temp.path())?; + let operation_id = "terminal_operation:1".to_string(); + + assert_eq!( + rollout.tool_calls["tool-stdin"].terminal_operation_id, + Some(operation_id.clone()), + ); + assert_eq!( + rollout.tool_calls["tool-stdin"].summary, + ToolCallSummary::Terminal { + operation_id: operation_id.clone(), + }, + ); + assert_eq!( + rollout.terminal_operations[&operation_id], + TerminalOperation { + operation_id: operation_id.clone(), + terminal_id: Some("123".to_string()), + tool_call_id: "tool-stdin".to_string(), + kind: TerminalOperationKind::WriteStdin, + execution: ExecutionWindow { + started_at_unix_ms: tool_start.wall_time_unix_ms, + started_seq: tool_start.seq, + ended_at_unix_ms: Some(tool_end.wall_time_unix_ms), + ended_seq: Some(tool_end.seq), + status: ExecutionStatus::Completed, + }, + request: TerminalRequest::WriteStdin { + stdin: "echo hi\n".to_string(), + yield_time_ms: Some(250), + max_output_tokens: Some(2000), + }, + result: Some(TerminalResult { + exit_code: None, + stdout: "hi\n".to_string(), + stderr: String::new(), + formatted_output: Some("hi\n".to_string()), + original_token_count: None, + chunk_id: None, + }), + model_observations: Vec::new(), + raw_payload_ids: vec![request_payload_id, response_payload_id], + }, + ); + assert_eq!( + rollout.terminal_sessions["123"], + TerminalSession { + terminal_id: "123".to_string(), + thread_id: "thread-root".to_string(), + created_by_operation_id: operation_id.clone(), + operation_ids: vec![operation_id], + execution: ExecutionWindow { + started_at_unix_ms: tool_start.wall_time_unix_ms, + started_seq: tool_start.seq, + ended_at_unix_ms: None, + ended_seq: None, + status: ExecutionStatus::Running, + }, + }, + ); + + Ok(()) +} + +#[test] +fn code_mode_write_stdin_result_projects_structured_exec_fields() -> anyhow::Result<()> { + let temp = TempDir::new()?; + let writer = create_started_writer(&temp)?; + start_turn(&writer, "turn-1")?; + + let request_payload = writer.write_json_payload( + 
RawPayloadKind::ToolInvocation, + &json!({ + "tool_name": "write_stdin", + "tool_namespace": null, + "payload": { + "type": "function", + "arguments": json!({ + "session_id": 456, + "chars": "", + "yield_time_ms": 1000, + "max_output_tokens": 4000 + }).to_string() + } + }), + )?; + let response_payload = writer.write_json_payload( + RawPayloadKind::ToolResult, + &json!({ + "type": "code_mode_response", + "value": { + "chunk_id": "abc123", + "wall_time_seconds": 1.25, + "exit_code": 0, + "original_token_count": 3, + "output": "done\n" + } + }), + )?; + writer.append_with_context( + trace_context("turn-1"), + RawTraceEventPayload::CodeCellStarted { + runtime_cell_id: "cell-1".to_string(), + model_visible_call_id: "call-code".to_string(), + source_js: "await tools.write_stdin({ chars: '' })".to_string(), + }, + )?; + writer.append_with_context( + trace_context("turn-1"), + RawTraceEventPayload::ToolCallStarted { + tool_call_id: "tool-stdin".to_string(), + model_visible_call_id: None, + code_mode_runtime_tool_id: Some("runtime-tool-1".to_string()), + requester: crate::raw_event::RawToolCallRequester::CodeCell { + runtime_cell_id: "cell-1".to_string(), + }, + kind: ToolCallKind::WriteStdin, + summary: generic_summary("write_stdin"), + invocation_payload: Some(request_payload), + }, + )?; + writer.append_with_context( + trace_context("turn-1"), + RawTraceEventPayload::ToolCallEnded { + tool_call_id: "tool-stdin".to_string(), + status: ExecutionStatus::Completed, + result_payload: Some(response_payload), + }, + )?; + + let rollout = replay_bundle(temp.path())?; + assert_eq!( + rollout.terminal_operations["terminal_operation:1"].result, + Some(TerminalResult { + exit_code: Some(0), + stdout: "done\n".to_string(), + stderr: String::new(), + formatted_output: Some("done\n".to_string()), + original_token_count: Some(3), + chunk_id: Some("abc123".to_string()), + }), + ); + + Ok(()) +} + +fn append_inference_with_tool_call(writer: &TraceWriter) -> anyhow::Result<()> { + let 
request = writer.write_json_payload( + RawPayloadKind::InferenceRequest, + &json!({ + "input": [message("user", "run tests")] + }), + )?; + writer.append(RawTraceEventPayload::InferenceStarted { + inference_call_id: "inference-1".to_string(), + thread_id: "thread-root".to_string(), + codex_turn_id: "turn-1".to_string(), + model: "gpt-test".to_string(), + provider_name: "test-provider".to_string(), + request_payload: request, + })?; + + let response = writer.write_json_payload( + RawPayloadKind::InferenceResponse, + &json!({ + "response_id": "resp-1", + "output_items": [{ + "type": "function_call", + "name": "exec_command", + "arguments": "{\"cmd\":\"cargo test\"}", + "call_id": "call-1" + }] + }), + )?; + writer.append(RawTraceEventPayload::InferenceCompleted { + inference_call_id: "inference-1".to_string(), + response_id: Some("resp-1".to_string()), + response_payload: response, + })?; + Ok(()) +} + +fn append_followup_with_tool_output(writer: &TraceWriter) -> anyhow::Result<()> { + let request = writer.write_json_payload( + RawPayloadKind::InferenceRequest, + &json!({ + "previous_response_id": "resp-1", + "input": [{ + "type": "function_call_output", + "call_id": "call-1", + "output": "ok\n" + }] + }), + )?; + writer.append(RawTraceEventPayload::InferenceStarted { + inference_call_id: "inference-2".to_string(), + thread_id: "thread-root".to_string(), + codex_turn_id: "turn-2".to_string(), + model: "gpt-test".to_string(), + provider_name: "test-provider".to_string(), + request_payload: request, + })?; + Ok(()) +} diff --git a/codex-rs/rollout-trace/src/writer.rs b/codex-rs/rollout-trace/src/writer.rs new file mode 100644 index 000000000000..1676ff2e64eb --- /dev/null +++ b/codex-rs/rollout-trace/src/writer.rs @@ -0,0 +1,264 @@ +//! Hot-path trace bundle writer. 
+
+use std::fs::File;
+use std::fs::OpenOptions;
+use std::io::BufWriter;
+use std::io::Write;
+use std::path::Path;
+use std::path::PathBuf;
+use std::sync::Mutex;
+use std::sync::MutexGuard;
+use std::sync::PoisonError;
+use std::time::SystemTime;
+use std::time::UNIX_EPOCH;
+
+use anyhow::Context;
+use anyhow::Result;
+use serde::Serialize;
+
+use crate::bundle::MANIFEST_FILE_NAME;
+use crate::bundle::PAYLOADS_DIR_NAME;
+use crate::bundle::RAW_EVENT_LOG_FILE_NAME;
+use crate::bundle::TraceBundleManifest;
+use crate::model::AgentThreadId;
+use crate::payload::RawPayloadKind;
+use crate::payload::RawPayloadRef;
+use crate::raw_event::RAW_TRACE_EVENT_SCHEMA_VERSION;
+use crate::raw_event::RawTraceEvent;
+use crate::raw_event::RawTraceEventContext;
+use crate::raw_event::RawTraceEventPayload;
+
+/// Local trace bundle writer.
+///
+/// The writer appends raw events and writes payload files. It does not keep a
+/// reduced `RolloutTrace` in memory; replay is owned by the reducer.
+#[derive(Debug)]
+pub struct TraceWriter {
+    inner: Mutex<TraceWriterInner>,
+}
+
+#[derive(Debug)]
+struct TraceWriterInner {
+    manifest: TraceBundleManifest,
+    payloads_dir: PathBuf,
+    event_log: BufWriter<File>,
+    next_seq: u64,
+    next_payload_ordinal: u64,
+}
+
+impl TraceWriter {
+    /// Creates a trace bundle directory and writes its manifest.
+    pub fn create(
+        bundle_dir: impl AsRef<Path>,
+        trace_id: String,
+        rollout_id: String,
+        root_thread_id: AgentThreadId,
+    ) -> Result<Self> {
+        let bundle_dir = bundle_dir.as_ref().to_path_buf();
+        let payloads_dir = bundle_dir.join(PAYLOADS_DIR_NAME);
+        std::fs::create_dir_all(&payloads_dir)
+            .with_context(|| format!("create trace payload dir {}", payloads_dir.display()))?;
+
+        let started_at_unix_ms = unix_time_ms();
+        let manifest =
+            TraceBundleManifest::new(trace_id, rollout_id, root_thread_id, started_at_unix_ms);
+        write_json_file(&bundle_dir.join(MANIFEST_FILE_NAME), &manifest)?;
+
+        let event_log_path = bundle_dir.join(RAW_EVENT_LOG_FILE_NAME);
+        let event_log = OpenOptions::new()
+            .create(true)
+            .append(true)
+            .open(&event_log_path)
+            .with_context(|| format!("open trace event log {}", event_log_path.display()))?;
+
+        Ok(Self {
+            inner: Mutex::new(TraceWriterInner {
+                manifest,
+                payloads_dir,
+                event_log: BufWriter::new(event_log),
+                next_seq: 1,
+                next_payload_ordinal: 1,
+            }),
+        })
+    }
+
+    /// Writes a JSON payload file and returns its reduced-state reference.
+    pub fn write_json_payload(
+        &self,
+        kind: RawPayloadKind,
+        value: &impl Serialize,
+    ) -> Result<RawPayloadRef> {
+        let mut inner = self.lock_inner();
+        let ordinal = inner.next_payload_ordinal;
+        inner.next_payload_ordinal += 1;
+        let raw_payload_id = format!("raw_payload:{ordinal}");
+        let relative_path = format!("{PAYLOADS_DIR_NAME}/{ordinal}.json");
+        let absolute_path = inner.payloads_dir.join(format!("{ordinal}.json"));
+        // Payload files are created before the event that references them. A
+        // replay interrupted after an event is appended should never point at a
+        // payload file that the writer planned but had not written yet.
+        write_json_file(&absolute_path, value)?;
+        Ok(RawPayloadRef {
+            raw_payload_id,
+            kind,
+            path: relative_path,
+        })
+    }
+
+    /// Appends one raw event with no extra envelope context.
+ pub fn append(&self, payload: RawTraceEventPayload) -> Result { + self.append_with_context(RawTraceEventContext::default(), payload) + } + + /// Appends one raw event with explicit thread/turn context. + pub fn append_with_context( + &self, + context: RawTraceEventContext, + payload: RawTraceEventPayload, + ) -> Result { + let mut inner = self.lock_inner(); + let event = RawTraceEvent { + schema_version: RAW_TRACE_EVENT_SCHEMA_VERSION, + seq: inner.next_seq, + wall_time_unix_ms: unix_time_ms(), + rollout_id: inner.manifest.rollout_id.clone(), + thread_id: context.thread_id, + codex_turn_id: context.codex_turn_id, + payload, + }; + inner.next_seq += 1; + serde_json::to_writer(&mut inner.event_log, &event)?; + inner.event_log.write_all(b"\n")?; + inner.event_log.flush()?; + Ok(event) + } + + fn lock_inner(&self) -> MutexGuard<'_, TraceWriterInner> { + // Preserve the event log after a panic in tracing code. Dropping the + // writer would lose subsequent diagnostic events in exactly the session + // we are trying to debug. 
+ self.inner.lock().unwrap_or_else(PoisonError::into_inner) + } +} + +fn write_json_file(path: &Path, value: &impl Serialize) -> Result<()> { + let file = File::create(path).with_context(|| format!("create {}", path.display()))?; + serde_json::to_writer_pretty(file, value) + .with_context(|| format!("write JSON {}", path.display())) +} + +pub(crate) fn unix_time_ms() -> i64 { + let duration = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default(); + i64::try_from(duration.as_millis()).unwrap_or(i64::MAX) +} + +#[cfg(test)] +mod tests { + use pretty_assertions::assert_eq; + use serde_json::json; + use tempfile::TempDir; + + use crate::model::ExecutionStatus; + use crate::model::RolloutStatus; + use crate::payload::RawPayloadKind; + use crate::raw_event::RawTraceEventPayload; + use crate::replay_bundle; + use crate::writer::TraceWriter; + + #[test] + fn writer_records_payload_refs_and_replays_rollout_status() -> anyhow::Result<()> { + let temp = TempDir::new()?; + let writer = TraceWriter::create( + temp.path(), + "trace-1".to_string(), + "rollout-1".to_string(), + "thread-root".to_string(), + )?; + + writer.append(RawTraceEventPayload::RolloutStarted { + trace_id: "trace-1".to_string(), + root_thread_id: "thread-root".to_string(), + })?; + let metadata_payload = writer.write_json_payload( + RawPayloadKind::ProtocolEvent, + &json!({ + "source": "test", + "model": "gpt-test", + }), + )?; + writer.append(RawTraceEventPayload::ThreadStarted { + thread_id: "thread-root".to_string(), + agent_path: "/root".to_string(), + metadata_payload: Some(metadata_payload.clone()), + })?; + writer.append(RawTraceEventPayload::CodexTurnStarted { + codex_turn_id: "turn-1".to_string(), + thread_id: "thread-root".to_string(), + })?; + let inference_request = writer.write_json_payload( + RawPayloadKind::InferenceRequest, + &json!({ + "model": "gpt-test", + "input": [{ + "type": "message", + "role": "user", + "content": [{"type": "input_text", "text": "hello"}] + }], + }), + 
)?; + writer.append(RawTraceEventPayload::InferenceStarted { + inference_call_id: "inference-1".to_string(), + thread_id: "thread-root".to_string(), + codex_turn_id: "turn-1".to_string(), + model: "gpt-test".to_string(), + provider_name: "test-provider".to_string(), + request_payload: inference_request.clone(), + })?; + let inference_response = writer.write_json_payload( + RawPayloadKind::InferenceResponse, + &json!({ + "response_id": "resp-1", + "output_items": [], + }), + )?; + writer.append(RawTraceEventPayload::InferenceCompleted { + inference_call_id: "inference-1".to_string(), + response_id: Some("resp-1".to_string()), + response_payload: inference_response.clone(), + })?; + writer.append(RawTraceEventPayload::CodexTurnEnded { + codex_turn_id: "turn-1".to_string(), + status: ExecutionStatus::Completed, + })?; + writer.append(RawTraceEventPayload::RolloutEnded { + status: RolloutStatus::Completed, + })?; + + let rollout = replay_bundle(temp.path())?; + + assert_eq!(rollout.status, RolloutStatus::Completed); + assert_eq!(rollout.root_thread_id, "thread-root"); + assert_eq!(rollout.threads["thread-root"].agent_path, "/root"); + assert_eq!(rollout.codex_turns["turn-1"].thread_id, "thread-root"); + assert_eq!( + rollout.codex_turns["turn-1"].execution.status, + ExecutionStatus::Completed, + ); + assert_eq!( + rollout.inference_calls["inference-1"].raw_request_payload_id, + inference_request.raw_payload_id, + ); + assert_eq!( + rollout.inference_calls["inference-1"].raw_response_payload_id, + Some(inference_response.raw_payload_id), + ); + assert_eq!( + rollout.raw_payloads[&metadata_payload.raw_payload_id].path, + "payloads/1.json" + ); + + Ok(()) + } +}