From abeef2887511a01f80f368530d30b0e00df97e4a Mon Sep 17 00:00:00 2001 From: Dhruv Gupta Date: Thu, 16 Apr 2026 22:27:49 +0000 Subject: [PATCH 01/47] agent-*-advanced: wire up durable-resume branch of databricks-ai-bridge MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pins both advanced templates to the ai-bridge PR branch so the long-running agent server crash-resumes in-flight runs via heartbeat + CAS claim. Revert the [tool.uv.sources] entry once that PR merges and a new release is cut. Also fixes a latent IndexError in agent-openai-advanced's deduplicate_input: when the long-running server re-invokes the handler with input=[] to resume from the session (the agnostic resume contract validated by prototyping), messages[-1] blew up. Now we return [] for empty input — the session already has prior turns so there is nothing to dedupe. No change to either template's agent.py. --- agent-langgraph-advanced/pyproject.toml | 5 +++++ agent-openai-advanced/agent_server/utils.py | 5 +++++ agent-openai-advanced/pyproject.toml | 5 +++++ 3 files changed, 15 insertions(+) diff --git a/agent-langgraph-advanced/pyproject.toml b/agent-langgraph-advanced/pyproject.toml index 203f42a5..a4f66381 100644 --- a/agent-langgraph-advanced/pyproject.toml +++ b/agent-langgraph-advanced/pyproject.toml @@ -41,6 +41,11 @@ setup = [ [tool.uv] default-groups = ["dev", "setup"] +# TEMPORARY: point at the open PR branch while ML-64230 durable-resume +# changes are in review. Revert to the registry release once merged. +[tool.uv.sources] +databricks-ai-bridge = { git = "https://github.com/databricks/databricks-ai-bridge", branch = "dhruv0811/durable-execution-resume" } + [tool.pytest.ini_options] base_url = "http://localhost:8000" diff --git a/agent-openai-advanced/agent_server/utils.py b/agent-openai-advanced/agent_server/utils.py index 0c205afc..13b90cbb 100644 --- a/agent-openai-advanced/agent_server/utils.py +++ b/agent-openai-advanced/agent_server/utils.py @@ -159,6 +159,11 @@ async def deduplicate_input(request: ResponsesAgentRequest, session: AsyncDatabr since the session will prepend the full history automatically. """ messages = [i.model_dump() for i in request.input] + # Empty input is a valid signal from the long-running server to resume an + # existing session without re-sending user content — the session already + # has the prior turns, so there is nothing to deduplicate. + if not messages: + return [] # Normalize assistant message content from string to structured list format. # MLflow evaluation sends assistant content as a plain string, but the OpenAI # Agents SDK expects it as [{"type": "output_text", "text": ..., "annotations": []}]. diff --git a/agent-openai-advanced/pyproject.toml b/agent-openai-advanced/pyproject.toml index 783adab9..95f01416 100644 --- a/agent-openai-advanced/pyproject.toml +++ b/agent-openai-advanced/pyproject.toml @@ -42,6 +42,11 @@ setup = [ prerelease = "allow" default-groups = ["dev", "setup"] +# TEMPORARY: point at the open PR branch while ML-64230 durable-resume +# changes are in review. Revert to the registry release once merged. 
+[tool.uv.sources] +databricks-ai-bridge = { git = "https://github.com/databricks/databricks-ai-bridge", branch = "dhruv0811/durable-execution-resume" } + [tool.pytest.ini_options] base_url = "http://localhost:8000" addopts = "-n 7" From 83a8e7e8e6c5d77657e55dc70c4cc56b5e2d90ca Mon Sep 17 00:00:00 2001 From: Dhruv Gupta Date: Mon, 20 Apr 2026 20:40:21 +0000 Subject: [PATCH 02/47] Wire UI through LongRunningAgentServer's background+resume contract Makes the bundled chat UI durable end-to-end without any client-side changes. The Express /invocations proxy in e2e-chatbot-app-next now: - Rewrites streaming POSTs to { ...body, background: true, stream: true }, so every user turn persists each SSE event to Lakebase via LongRunningAgentServer. - Sniffs response.id + sequence_number out of the forwarded SSE stream. - If upstream closes before [DONE] (pod died, lost connection), the proxy transparently reconnects via GET /responses/{id}?stream=true&starting_after=N and resumes emitting events to the still-connected browser client. The browser sees one continuous stream. Non-streaming requests and non-POST methods keep the original passthrough behavior. Also points agent-openai-advanced/scripts/start_app.py at the dhruv0811/durable-execution-templates branch of app-templates so the new proxy code is actually deployed (override via APP_TEMPLATES_BRANCH env var). Revert once this lands on main. --- agent-openai-advanced/databricks.yml | 45 ++---- agent-openai-advanced/scripts/start_app.py | 17 ++- e2e-chatbot-app-next/server/src/index.ts | 170 ++++++++++++++++++--- 3 files changed, 178 insertions(+), 54 deletions(-) diff --git a/agent-openai-advanced/databricks.yml b/agent-openai-advanced/databricks.yml index 0e8eef4d..70e98095 100644 --- a/agent-openai-advanced/databricks.yml +++ b/agent-openai-advanced/databricks.yml @@ -4,12 +4,12 @@ bundle: resources: apps: agent_openai_advanced: - name: "agent-openai-advanced" + name: "dhruv-agent-openai-adv-durable" description: "OpenAI Agents SDK agent with short-term memory and long-running background task support" source_code_path: ./ config: - command: ["uv", "run", "start-app"] + command: ["uv", "run", "start-app", "--no-ui"] env: - name: MLFLOW_TRACKING_URI value: "databricks" @@ -23,50 +23,35 @@ resources: value: "300" - name: MLFLOW_EXPERIMENT_ID value_from: "experiment" - - name: LAKEBASE_AUTOSCALING_ENDPOINT - value_from: "postgres" - # Autoscaling Lakebase config - - name: LAKEBASE_AUTOSCALING_PROJECT - value: "" - - name: LAKEBASE_AUTOSCALING_BRANCH - value: "" + - name: LAKEBASE_INSTANCE_NAME + value_from: "database" - name: LOG_LEVEL value: "INFO" - # Use for provisioned lakebase resource - # - name: LAKEBASE_INSTANCE_NAME - # value: "" + # Temporary: exposes /_debug/kill_task/{id} for integration tests + # that simulate pod crashes without restarting the app. Remove + # before any production use. 
+ - name: LONG_RUNNING_ENABLE_DEBUG_KILL + value: "1" # Resources which this app has access to resources: - name: 'experiment' experiment: - experiment_id: "" + experiment_id: "1490685316442238" permission: 'CAN_MANAGE' - # Autoscaling postgres resource - # See: .claude/skills/add-tools/examples/lakebase-autoscaling.yaml - - name: 'postgres' - postgres: - branch: "projects//branches/" - database: "projects//branches//databases/" + - name: 'database' + database: + instance_name: 'dhruv-gupta' + database_name: 'databricks_postgres' permission: 'CAN_CONNECT_AND_CREATE' - # Use for provisioned lakebase resource - # - name: 'database' - # database: - # instance_name: '' - # database_name: 'databricks_postgres' - # permission: 'CAN_CONNECT_AND_CREATE' targets: dev: mode: development default: true - # workspace: - # host: https://... prod: mode: production - # workspace: - # host: https://... resources: apps: agent_openai_advanced: - name: agent-openai-adv + name: dhruv-agent-openai-adv-durable diff --git a/agent-openai-advanced/scripts/start_app.py b/agent-openai-advanced/scripts/start_app.py index 557fe4e7..b95ba1df 100644 --- a/agent-openai-advanced/scripts/start_app.py +++ b/agent-openai-advanced/scripts/start_app.py @@ -139,14 +139,27 @@ def clone_frontend_if_needed(self): if Path("e2e-chatbot-app-next").exists(): return True - print("Cloning e2e-chatbot-app-next...") + # TEMPORARY: checkout the dhruv0811/durable-execution-templates branch + # so the deployed frontend includes the background-mode /invocations + # proxy with auto-resume. Revert to cloning main after that PR merges. + branch = os.getenv("APP_TEMPLATES_BRANCH", "dhruv0811/durable-execution-templates") + print(f"Cloning e2e-chatbot-app-next from branch '{branch}'...") for url in [ "https://github.com/databricks/app-templates.git", "git@github.com:databricks/app-templates.git", ]: try: subprocess.run( - ["git", "clone", "--filter=blob:none", "--sparse", url, "temp-app-templates"], + [ + "git", + "clone", + "--filter=blob:none", + "--sparse", + "--branch", + branch, + url, + "temp-app-templates", + ], check=True, capture_output=True, ) diff --git a/e2e-chatbot-app-next/server/src/index.ts b/e2e-chatbot-app-next/server/src/index.ts index 132d80d4..cb1ecdef 100644 --- a/e2e-chatbot-app-next/server/src/index.ts +++ b/e2e-chatbot-app-next/server/src/index.ts @@ -59,46 +59,172 @@ app.use('/api/config', configRouter); app.use('/api/feedback', feedbackRouter); // Agent backend proxy (optional) -// If API_PROXY is set, proxy /invocations requests to the agent backend +// If API_PROXY is set, proxy /invocations requests to the agent backend. +// For streaming POSTs we rewrite into LongRunningAgentServer's "background" +// contract: the backend persists every event to Lakebase, the proxy auto- +// resumes via GET /responses/{id}?stream=true&starting_after=N if the +// upstream connection dies before the [DONE] sentinel. This is what makes +// the UI survive mid-response pod crashes — zero client-side changes. 
const agentBackendUrl = process.env.API_PROXY; if (agentBackendUrl) { console.log(`✅ Proxying /invocations to ${agentBackendUrl}`); + + // Derive the retrieve endpoint (strip trailing /invocations or /responses) + const backendRoot = agentBackendUrl.replace(/\/(invocations|responses)\/?$/, ''); + const retrieveUrl = (rid: string, startingAfter: number) => + `${backendRoot}/responses/${rid}?stream=true&starting_after=${startingAfter}`; + app.all('/invocations', async (req: Request, res: Response) => { try { const forwardHeaders = { ...req.headers } as Record; - forwardHeaders['content-length'] = undefined; + // biome-ignore lint/performance/noDelete: fetch rejects empty content-length + delete forwardHeaders['content-length']; - const response = await fetch(agentBackendUrl, { - method: req.method, - headers: forwardHeaders, - body: - req.method !== 'GET' && req.method !== 'HEAD' - ? JSON.stringify(req.body) - : undefined, - }); + const isStreamingPost = + req.method === 'POST' && + req.body && + typeof req.body === 'object' && + (req.body.stream === true || req.body.stream === 'true'); - // Copy status and headers - res.status(response.status); - response.headers.forEach((value, key) => { - res.setHeader(key, value); - }); + // Non-streaming or non-POST: original passthrough behavior. + if (!isStreamingPost) { + const response = await fetch(agentBackendUrl, { + method: req.method, + headers: forwardHeaders, + body: + req.method !== 'GET' && req.method !== 'HEAD' + ? JSON.stringify(req.body) + : undefined, + }); + res.status(response.status); + response.headers.forEach((value, key) => res.setHeader(key, value)); + if (response.body) { + const reader = response.body.getReader(); + while (true) { + const { done, value } = await reader.read(); + if (done) break; + res.write(value); + } + } + res.end(); + return; + } + + // Streaming POST → background mode with auto-resume. + const durableBody = { + ...req.body, + background: true, + stream: true, + }; - // Stream the response body - if (response.body) { - const reader = response.body.getReader(); + // Prime SSE headers immediately so the client starts reading even if the + // first upstream chunk takes a moment. + res.status(200); + res.setHeader('content-type', 'text/event-stream'); + res.setHeader('cache-control', 'no-cache'); + res.setHeader('connection', 'keep-alive'); + res.flushHeaders?.(); + + let responseId: string | null = null; + let lastSeq = 0; + let sawDone = false; + // Safety cap so a permanently-broken backend can't loop forever. + const MAX_RESUME_ATTEMPTS = 10; + let resumeAttempt = 0; + + // Read one SSE stream to completion, writing every chunk to the client + // and tracking response_id + sequence_number from each event. Returns + // whether we saw the [DONE] sentinel. + const pumpStream = async (upstream: globalThis.Response) => { + if (!upstream.body) return false; + const reader = upstream.body.getReader(); + const decoder = new TextDecoder(); + let buf = ''; while (true) { const { done, value } = await reader.read(); if (done) break; res.write(value); + buf += decoder.decode(value, { stream: true }); + // Pull out complete SSE frames (separated by \n\n) to sniff metadata. 
+ const frames = buf.split(/\n\n/); + buf = frames.pop() || ''; + for (const frame of frames) { + if (frame.includes('data: [DONE]')) { + return true; + } + const dataLine = frame.split('\n').find((l) => l.startsWith('data:')); + if (!dataLine) continue; + try { + const parsed = JSON.parse(dataLine.slice(5).trim()); + if (!responseId && typeof parsed.id === 'string') { + responseId = parsed.id; + } + if ( + typeof parsed.sequence_number === 'number' && + parsed.sequence_number > lastSeq + ) { + lastSeq = parsed.sequence_number; + } + } catch { + // Non-JSON SSE frame (e.g. heartbeats) — safe to ignore. + } + } } + return false; + }; + + // Kickoff: POST background request. + const initial = await fetch(agentBackendUrl, { + method: 'POST', + headers: forwardHeaders, + body: JSON.stringify(durableBody), + }); + if (!initial.ok) { + const text = await initial.text(); + res.write( + `event: error\ndata: ${JSON.stringify({ error: { message: text, status: initial.status } })}\n\n`, + ); + res.end(); + return; } + sawDone = await pumpStream(initial); + + // Auto-resume loop: if upstream closed early (pod crash) and we know a + // response_id, reconnect via the retrieve endpoint using our cursor. + while (!sawDone && responseId && resumeAttempt < MAX_RESUME_ATTEMPTS) { + resumeAttempt += 1; + console.log( + `[/invocations] resuming stream for ${responseId} from seq=${lastSeq} (attempt ${resumeAttempt})`, + ); + const resumed = await fetch(retrieveUrl(responseId, lastSeq), { + method: 'GET', + headers: forwardHeaders, + }); + if (!resumed.ok) { + res.write( + `event: error\ndata: ${JSON.stringify({ error: { message: 'Resume fetch failed', status: resumed.status } })}\n\n`, + ); + break; + } + sawDone = await pumpStream(resumed); + } + res.end(); } catch (error) { console.error('[/invocations proxy] Error:', error); - res.status(502).json({ - error: 'Proxy error', - message: error instanceof Error ? error.message : String(error), - }); + if (!res.headersSent) { + res.status(502).json({ + error: 'Proxy error', + message: error instanceof Error ? error.message : String(error), + }); + } else { + try { + res.write( + `event: error\ndata: ${JSON.stringify({ error: { message: error instanceof Error ? error.message : String(error) } })}\n\n`, + ); + res.end(); + } catch {} + } } }); } From fa550afd760c425c18b52d612cbc7ed8bf7d694d Mon Sep 17 00:00:00 2001 From: Dhruv Gupta Date: Mon, 20 Apr 2026 21:02:47 +0000 Subject: [PATCH 03/47] Route AI SDK through Express /invocations proxy so background-rewrite actually fires Previous attempt left the proxy dead-code: the Node AI SDK honored API_PROXY verbatim and sent requests straight to http://localhost:8000/invocations (FastAPI), skipping the Express /invocations handler at :3000 entirely. Confirmed in logs: requests reached the backend with {"stream": true} but never with "background": true. Split the two concerns across env vars: API_PROXY=http://localhost:3000/invocations (AI SDK -> Express proxy) AGENT_BACKEND_URL=http://localhost:8000/invocations (Express proxy -> FastAPI) Express handler prefers AGENT_BACKEND_URL, falls back to API_PROXY for backwards compat so existing templates don't break. 
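For reviewers, a minimal sketch of the two-step contract this routing ultimately
exercises against the FastAPI backend. Illustrative only, not template code: the
real path is AI SDK -> Express proxy -> FastAPI, and the proxy tracks
sequence_number from each SSE event rather than the naive cursor used here.

    import os
    import re

    import requests  # illustrative client; the Express proxy uses fetch instead

    backend = os.getenv("AGENT_BACKEND_URL", "http://localhost:8000/invocations")
    root = re.sub(r"/invocations/?$", "", backend)

    def run_turn(body: dict) -> str:
        # 1. Background kickoff: the server persists every SSE event to Lakebase.
        text = ""
        with requests.post(
            backend, json={**body, "background": True, "stream": True}, stream=True
        ) as r:
            for line in r.iter_lines(decode_unicode=True):
                text += (line or "") + "\n"
        # 2. If the stream died before [DONE], resume from the retrieve endpoint
        #    (starting_after=0 here for brevity).
        m = re.search(r'"id":\s*"(resp_[^"]+)"', text)
        if m and "data: [DONE]" not in text:
            resume = f"{root}/responses/{m.group(1)}?stream=true&starting_after=0"
            with requests.get(resume, stream=True) as r:
                for line in r.iter_lines(decode_unicode=True):
                    text += (line or "") + "\n"
        return text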
--- agent-openai-advanced/databricks.yml | 9 ++++++++- e2e-chatbot-app-next/server/src/index.ts | 21 ++++++++++++++------- 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/agent-openai-advanced/databricks.yml b/agent-openai-advanced/databricks.yml index 70e98095..fcd9122b 100644 --- a/agent-openai-advanced/databricks.yml +++ b/agent-openai-advanced/databricks.yml @@ -9,13 +9,20 @@ resources: background task support" source_code_path: ./ config: - command: ["uv", "run", "start-app", "--no-ui"] + command: ["uv", "run", "start-app"] env: - name: MLFLOW_TRACKING_URI value: "databricks" - name: MLFLOW_REGISTRY_URI value: "databricks-uc" + # API_PROXY points the Node AI SDK at the Express /invocations + # handler (on the same Node process, port 3000), which rewrites the + # request into background mode and auto-resumes on disconnect. + # AGENT_BACKEND_URL is where that handler forwards — the real + # Python FastAPI agent endpoint. - name: API_PROXY + value: "http://localhost:3000/invocations" + - name: AGENT_BACKEND_URL value: "http://localhost:8000/invocations" - name: CHAT_APP_PORT value: "3000" diff --git a/e2e-chatbot-app-next/server/src/index.ts b/e2e-chatbot-app-next/server/src/index.ts index cb1ecdef..3026572e 100644 --- a/e2e-chatbot-app-next/server/src/index.ts +++ b/e2e-chatbot-app-next/server/src/index.ts @@ -59,13 +59,20 @@ app.use('/api/config', configRouter); app.use('/api/feedback', feedbackRouter); // Agent backend proxy (optional) -// If API_PROXY is set, proxy /invocations requests to the agent backend. -// For streaming POSTs we rewrite into LongRunningAgentServer's "background" -// contract: the backend persists every event to Lakebase, the proxy auto- -// resumes via GET /responses/{id}?stream=true&starting_after=N if the -// upstream connection dies before the [DONE] sentinel. This is what makes -// the UI survive mid-response pod crashes — zero client-side changes. -const agentBackendUrl = process.env.API_PROXY; +// If AGENT_BACKEND_URL (or legacy API_PROXY) is set, proxy /invocations +// requests to the agent backend. For streaming POSTs we rewrite into +// LongRunningAgentServer's "background" contract: the backend persists every +// event to Lakebase, the proxy auto-resumes via +// GET /responses/{id}?stream=true&starting_after=N +// if the upstream connection dies before the [DONE] sentinel. This is what +// makes the UI survive mid-response pod crashes — zero client-side changes. +// +// IMPORTANT: when running with the Python FastAPI backend, point +// AGENT_BACKEND_URL at FastAPI (e.g. http://localhost:8000/invocations) and +// set API_PROXY at THIS Express server (e.g. http://localhost:3000/invocations) +// so the AI SDK provider in providers-server.ts routes through this handler +// instead of going direct to FastAPI. +const agentBackendUrl = process.env.AGENT_BACKEND_URL || process.env.API_PROXY; if (agentBackendUrl) { console.log(`✅ Proxying /invocations to ${agentBackendUrl}`); From 911f18b61ab92f3d2da7bafe8132855da107d6c6 Mon Sep 17 00:00:00 2001 From: Dhruv Gupta Date: Mon, 20 Apr 2026 21:08:26 +0000 Subject: [PATCH 04/47] Log response_id on first SSE event from Express proxy response_id is buried in the raw backend SSE stream and never surfaces to the browser because the Vercel AI SDK re-wraps the stream as its own message format before sending to the client. Log it on the server side instead so test instructions can `grep 'background started response_id=' ` from apps logs. 
Also distinguish the startup log so it's clear the durable-resume code path is live. No behavior change; pure observability. --- e2e-chatbot-app-next/server/src/index.ts | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/e2e-chatbot-app-next/server/src/index.ts b/e2e-chatbot-app-next/server/src/index.ts index 3026572e..b9d584ac 100644 --- a/e2e-chatbot-app-next/server/src/index.ts +++ b/e2e-chatbot-app-next/server/src/index.ts @@ -74,7 +74,9 @@ app.use('/api/feedback', feedbackRouter); // instead of going direct to FastAPI. const agentBackendUrl = process.env.AGENT_BACKEND_URL || process.env.API_PROXY; if (agentBackendUrl) { - console.log(`✅ Proxying /invocations to ${agentBackendUrl}`); + console.log( + `✅ Proxying /invocations to ${agentBackendUrl} (durable-resume enabled)`, + ); // Derive the retrieve endpoint (strip trailing /invocations or /responses) const backendRoot = agentBackendUrl.replace(/\/(invocations|responses)\/?$/, ''); @@ -135,6 +137,9 @@ if (agentBackendUrl) { let responseId: string | null = null; let lastSeq = 0; let sawDone = false; + const onFirstResponseId = (rid: string) => { + console.log(`[/invocations] background started response_id=${rid}`); + }; // Safety cap so a permanently-broken backend can't loop forever. const MAX_RESUME_ATTEMPTS = 10; let resumeAttempt = 0; @@ -165,6 +170,7 @@ if (agentBackendUrl) { const parsed = JSON.parse(dataLine.slice(5).trim()); if (!responseId && typeof parsed.id === 'string') { responseId = parsed.id; + onFirstResponseId(responseId); } if ( typeof parsed.sequence_number === 'number' && From 399ffdea3b760f5abd09a6d075904a39df32cd19 Mon Sep 17 00:00:00 2001 From: Dhruv Gupta Date: Mon, 20 Apr 2026 21:14:59 +0000 Subject: [PATCH 05/47] Match API_PROXY + AGENT_BACKEND_URL in app.yaml too app.yaml env vars were overriding databricks.yml at runtime, so the AI SDK was still talking directly to the Python FastAPI backend and the Express /invocations proxy never saw the request. Keep both files in sync. --- agent-openai-advanced/app.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/agent-openai-advanced/app.yaml b/agent-openai-advanced/app.yaml index 1e406cb4..fab84034 100644 --- a/agent-openai-advanced/app.yaml +++ b/agent-openai-advanced/app.yaml @@ -6,7 +6,12 @@ env: value: "databricks" - name: MLFLOW_REGISTRY_URI value: "databricks-uc" + # API_PROXY points the Node AI SDK at the Express /invocations handler + # (port 3000), which rewrites to background mode and auto-resumes on + # disconnect. AGENT_BACKEND_URL is where that handler forwards. - name: API_PROXY + value: "http://localhost:3000/invocations" + - name: AGENT_BACKEND_URL value: "http://localhost:8000/invocations" - name: CHAT_APP_PORT value: "3000" From 04e6b1bf29a977d93e6759c2aebdeed791327617 Mon Sep 17 00:00:00 2001 From: Dhruv Gupta Date: Mon, 20 Apr 2026 21:18:56 +0000 Subject: [PATCH 06/47] start_app: point API_PROXY at the Express proxy, keep AGENT_BACKEND_URL to FastAPI The script was unconditionally overwriting API_PROXY with the backend URL right before launching the frontend, which defeated our whole durable- resume-rewrite story: the Node AI SDK bypassed the Express /invocations handler and streamed straight from FastAPI. Fix: API_PROXY now points at CHAT_APP_PORT (the Express proxy), and we default AGENT_BACKEND_URL (previously unset) to the Python backend. Use os.environ.setdefault for AGENT_BACKEND_URL so operators can still override via databricks.yml or app.yaml. 
--- agent-openai-advanced/scripts/start_app.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/agent-openai-advanced/scripts/start_app.py b/agent-openai-advanced/scripts/start_app.py index b95ba1df..93553d96 100644 --- a/agent-openai-advanced/scripts/start_app.py +++ b/agent-openai-advanced/scripts/start_app.py @@ -232,8 +232,22 @@ def run(self, backend_args=None): print("WARNING: Failed to clone frontend. Continuing with backend only.") self.no_ui = True else: - # Set API_PROXY environment variable for frontend to connect to backend - os.environ["API_PROXY"] = f"http://localhost:{self.port}/invocations" + # Point the Node AI SDK at the Express /invocations handler + # (same Node process, port CHAT_APP_PORT) so streaming POSTs + # go through the background-mode rewrite + auto-resume proxy. + # The proxy forwards to AGENT_BACKEND_URL (the Python backend + # on self.port). Respect any externally-provided values so + # operators can override per-deployment. + frontend_port = int( + os.environ.get("CHAT_APP_PORT", os.environ.get("PORT", "3000")) + ) + os.environ.setdefault( + "AGENT_BACKEND_URL", + f"http://localhost:{self.port}/invocations", + ) + os.environ["API_PROXY"] = ( + f"http://localhost:{frontend_port}/invocations" + ) # Open log files self.backend_log = open("backend.log", "w", buffering=1) From 948f7b4c502b095e2943d249ccc1c6a92876fdf9 Mon Sep 17 00:00:00 2001 From: Dhruv Gupta Date: Mon, 20 Apr 2026 21:22:27 +0000 Subject: [PATCH 07/47] Proxy: accept response_id from top-level, nested response.id, or id= resp_* Broadens the response_id parser so it works whether the backend tags frames with top-level response_id (preferred) or the older nested-only shape. --- e2e-chatbot-app-next/server/src/index.ts | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/e2e-chatbot-app-next/server/src/index.ts b/e2e-chatbot-app-next/server/src/index.ts index b9d584ac..e1ec134e 100644 --- a/e2e-chatbot-app-next/server/src/index.ts +++ b/e2e-chatbot-app-next/server/src/index.ts @@ -168,8 +168,17 @@ if (agentBackendUrl) { if (!dataLine) continue; try { const parsed = JSON.parse(dataLine.slice(5).trim()); - if (!responseId && typeof parsed.id === 'string') { - responseId = parsed.id; + // Accept response_id from the dedicated top-level tag, or fall + // back to the response.id / top-level id shapes so this works + // across LongRunningAgentServer versions. + const rid = + parsed.response_id ?? + parsed.response?.id ?? + (typeof parsed.id === 'string' && parsed.id.startsWith('resp_') + ? parsed.id + : null); + if (!responseId && typeof rid === 'string') { + responseId = rid; onFirstResponseId(responseId); } if ( From cea05080597d6fbcb9e16f3a99dbf995ba0cc42c Mon Sep 17 00:00:00 2001 From: Dhruv Gupta Date: Mon, 20 Apr 2026 21:32:33 +0000 Subject: [PATCH 08/47] Proxy: log upstream close + each resume-fetch attempt + final stream tally Matches the [/invocations] prefix so the full story is greppable from apps logs without correlating Node and Python timestamps. 
--- e2e-chatbot-app-next/server/src/index.ts | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/e2e-chatbot-app-next/server/src/index.ts b/e2e-chatbot-app-next/server/src/index.ts index e1ec134e..c63f0883 100644 --- a/e2e-chatbot-app-next/server/src/index.ts +++ b/e2e-chatbot-app-next/server/src/index.ts @@ -213,24 +213,42 @@ if (agentBackendUrl) { // Auto-resume loop: if upstream closed early (pod crash) and we know a // response_id, reconnect via the retrieve endpoint using our cursor. + if (!sawDone && responseId) { + console.log( + `[/invocations] upstream closed without [DONE] response_id=${responseId} last_seq=${lastSeq}; entering auto-resume`, + ); + } while (!sawDone && responseId && resumeAttempt < MAX_RESUME_ATTEMPTS) { resumeAttempt += 1; console.log( - `[/invocations] resuming stream for ${responseId} from seq=${lastSeq} (attempt ${resumeAttempt})`, + `[/invocations] resume fetch response_id=${responseId} starting_after=${lastSeq} attempt=${resumeAttempt}`, ); const resumed = await fetch(retrieveUrl(responseId, lastSeq), { method: 'GET', headers: forwardHeaders, }); if (!resumed.ok) { + console.log( + `[/invocations] resume failed response_id=${responseId} status=${resumed.status}`, + ); res.write( `event: error\ndata: ${JSON.stringify({ error: { message: 'Resume fetch failed', status: resumed.status } })}\n\n`, ); break; } sawDone = await pumpStream(resumed); + if (sawDone) { + console.log( + `[/invocations] resume succeeded response_id=${responseId} after ${resumeAttempt} attempts`, + ); + } } + if (responseId) { + console.log( + `[/invocations] stream done response_id=${responseId} saw_done=${sawDone} last_seq=${lastSeq} resumes=${resumeAttempt}`, + ); + } res.end(); } catch (error) { console.error('[/invocations proxy] Error:', error); From cafd07dc50c2f7cbfddb0a3367aba9a29f5535ed Mon Sep 17 00:00:00 2001 From: Dhruv Gupta Date: Mon, 20 Apr 2026 21:35:40 +0000 Subject: [PATCH 09/47] Surface databricks_ai_bridge [durable] INFO logs into apps output The library logger inherits from root (default WARNING) so INFO-level lifecycle messages from LongRunningAgentServer (heartbeat, claim, resume, stream lifecycle) were being dropped. Set both the ai-bridge logger and the root level to LOG_LEVEL so apps logs carry the full durable-resume story without requiring callers to tune logging themselves. --- agent-openai-advanced/agent_server/start_server.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/agent-openai-advanced/agent_server/start_server.py b/agent-openai-advanced/agent_server/start_server.py index eb5bfbfb..0df0b458 100644 --- a/agent-openai-advanced/agent_server/start_server.py +++ b/agent-openai-advanced/agent_server/start_server.py @@ -56,7 +56,18 @@ def transform_stream_event(self, event, response_id): ) log_level = os.getenv("LOG_LEVEL", "INFO") -logging.getLogger("agent_server").setLevel(getattr(logging, log_level.upper(), logging.INFO)) +_lvl = getattr(logging, log_level.upper(), logging.INFO) +logging.getLogger("agent_server").setLevel(_lvl) +# Surface [durable] lifecycle logs from LongRunningAgentServer into apps logs. +# These are INFO-level in databricks_ai_bridge but the library logger defaults +# to WARNING unless the host process sets it explicitly. +logging.getLogger("databricks_ai_bridge").setLevel(_lvl) +# Ensure the root handler actually emits at this level too. uvicorn sets up +# its own handlers for 'uvicorn.*' but leaves root untouched. 
+if not logging.getLogger().handlers: + logging.basicConfig(level=_lvl, format="%(asctime)s %(levelname)s %(name)s: %(message)s") +else: + logging.getLogger().setLevel(_lvl) # Wrap the existing lifespan to ensure session tables are created before serving requests _original_lifespan = agent_server.app.router.lifespan_context From 4cbc677b8bbd86edce77b9603198b347a5d3ab75 Mon Sep 17 00:00:00 2001 From: Dhruv Gupta Date: Mon, 20 Apr 2026 22:26:32 +0000 Subject: [PATCH 10/47] Two-bubble UX on resume + match durable-resume wiring across templates MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a response is killed mid-stream, the partial assistant text that was already rendered to the client kept receiving fresh deltas from attempt 2 — users saw attempt-1-partial + attempt-2-full concatenated in one bubble. Express /invocations proxy now seals the in-progress assistant message across an attempt boundary: 1. On upstream close without [DONE], immediately append a '(connection interrupted — reconnecting…)' suffix delta to the active message so the user sees something is happening during the ~10s stale window. 2. On the response.resumed sentinel, emit synthetic response.content_part.done + response.output_item.done events for the active message — effectively ending the first assistant bubble at OpenAI Responses API level. 3. Attempt 2's natural response.output_item.added (with a fresh item_id) then creates a clean second bubble showing the full answer. Tool calls naturally de-dup by call_id across attempts, so no closure synthesis needed for them. Also mirrors the routing + logging fixes previously applied to agent-openai-advanced onto agent-langgraph-advanced so both templates get durable resume with the full [durable] log lifecycle visible: - app.yaml + databricks.yml: split API_PROXY (-> Express :3000) from AGENT_BACKEND_URL (-> FastAPI :8000). - scripts/start_app.py: honor AGENT_BACKEND_URL, point API_PROXY at the Express proxy, clone e2e-chatbot-app-next from the durable-execution branch. - agent_server/start_server.py: raise databricks_ai_bridge + root logger to LOG_LEVEL so [durable] INFO lines surface in apps logs. --- .../agent_server/start_server.py | 10 + agent-langgraph-advanced/app.yaml | 5 + agent-langgraph-advanced/databricks.yml | 5 + agent-langgraph-advanced/scripts/start_app.py | 35 +++- e2e-chatbot-app-next/server/src/index.ts | 194 +++++++++++++++--- 5 files changed, 218 insertions(+), 31 deletions(-) diff --git a/agent-langgraph-advanced/agent_server/start_server.py b/agent-langgraph-advanced/agent_server/start_server.py index 4886afa3..a8cd2b02 100644 --- a/agent-langgraph-advanced/agent_server/start_server.py +++ b/agent-langgraph-advanced/agent_server/start_server.py @@ -41,6 +41,16 @@ def transform_stream_event(self, event, response_id): poll_interval_seconds=float(os.getenv("POLL_INTERVAL_SECONDS", "1.0")), ) +log_level = os.getenv("LOG_LEVEL", "INFO") +_lvl = getattr(logging, log_level.upper(), logging.INFO) +logging.getLogger("agent_server").setLevel(_lvl) +# Surface [durable] lifecycle logs from LongRunningAgentServer into apps logs. 
+logging.getLogger("databricks_ai_bridge").setLevel(_lvl) +if not logging.getLogger().handlers: + logging.basicConfig(level=_lvl, format="%(asctime)s %(levelname)s %(name)s: %(message)s") +else: + logging.getLogger().setLevel(_lvl) + # Define the app as a module level variable to enable multiple workers app = agent_server.app # noqa: F841 setup_mlflow_git_based_version_tracking() diff --git a/agent-langgraph-advanced/app.yaml b/agent-langgraph-advanced/app.yaml index 1e406cb4..aac3edf7 100644 --- a/agent-langgraph-advanced/app.yaml +++ b/agent-langgraph-advanced/app.yaml @@ -6,7 +6,12 @@ env: value: "databricks" - name: MLFLOW_REGISTRY_URI value: "databricks-uc" + # API_PROXY points the Node AI SDK at the Express /invocations handler + # (port 3000) which rewrites to background mode and auto-resumes on + # disconnect. AGENT_BACKEND_URL is where that handler forwards. - name: API_PROXY + value: "http://localhost:3000/invocations" + - name: AGENT_BACKEND_URL value: "http://localhost:8000/invocations" - name: CHAT_APP_PORT value: "3000" diff --git a/agent-langgraph-advanced/databricks.yml b/agent-langgraph-advanced/databricks.yml index 4e562209..db68a651 100644 --- a/agent-langgraph-advanced/databricks.yml +++ b/agent-langgraph-advanced/databricks.yml @@ -14,7 +14,12 @@ resources: value: "databricks" - name: MLFLOW_REGISTRY_URI value: "databricks-uc" + # API_PROXY points the Node AI SDK at the Express /invocations + # handler which rewrites to background mode and auto-resumes on + # disconnect. AGENT_BACKEND_URL is where that handler forwards. - name: API_PROXY + value: "http://localhost:3000/invocations" + - name: AGENT_BACKEND_URL value: "http://localhost:8000/invocations" - name: CHAT_APP_PORT value: "3000" diff --git a/agent-langgraph-advanced/scripts/start_app.py b/agent-langgraph-advanced/scripts/start_app.py index 557fe4e7..93553d96 100644 --- a/agent-langgraph-advanced/scripts/start_app.py +++ b/agent-langgraph-advanced/scripts/start_app.py @@ -139,14 +139,27 @@ def clone_frontend_if_needed(self): if Path("e2e-chatbot-app-next").exists(): return True - print("Cloning e2e-chatbot-app-next...") + # TEMPORARY: checkout the dhruv0811/durable-execution-templates branch + # so the deployed frontend includes the background-mode /invocations + # proxy with auto-resume. Revert to cloning main after that PR merges. + branch = os.getenv("APP_TEMPLATES_BRANCH", "dhruv0811/durable-execution-templates") + print(f"Cloning e2e-chatbot-app-next from branch '{branch}'...") for url in [ "https://github.com/databricks/app-templates.git", "git@github.com:databricks/app-templates.git", ]: try: subprocess.run( - ["git", "clone", "--filter=blob:none", "--sparse", url, "temp-app-templates"], + [ + "git", + "clone", + "--filter=blob:none", + "--sparse", + "--branch", + branch, + url, + "temp-app-templates", + ], check=True, capture_output=True, ) @@ -219,8 +232,22 @@ def run(self, backend_args=None): print("WARNING: Failed to clone frontend. Continuing with backend only.") self.no_ui = True else: - # Set API_PROXY environment variable for frontend to connect to backend - os.environ["API_PROXY"] = f"http://localhost:{self.port}/invocations" + # Point the Node AI SDK at the Express /invocations handler + # (same Node process, port CHAT_APP_PORT) so streaming POSTs + # go through the background-mode rewrite + auto-resume proxy. + # The proxy forwards to AGENT_BACKEND_URL (the Python backend + # on self.port). Respect any externally-provided values so + # operators can override per-deployment. 
+ frontend_port = int( + os.environ.get("CHAT_APP_PORT", os.environ.get("PORT", "3000")) + ) + os.environ.setdefault( + "AGENT_BACKEND_URL", + f"http://localhost:{self.port}/invocations", + ) + os.environ["API_PROXY"] = ( + f"http://localhost:{frontend_port}/invocations" + ) # Open log files self.backend_log = open("backend.log", "w", buffering=1) diff --git a/e2e-chatbot-app-next/server/src/index.ts b/e2e-chatbot-app-next/server/src/index.ts index c63f0883..4385148e 100644 --- a/e2e-chatbot-app-next/server/src/index.ts +++ b/e2e-chatbot-app-next/server/src/index.ts @@ -144,9 +144,72 @@ if (agentBackendUrl) { const MAX_RESUME_ATTEMPTS = 10; let resumeAttempt = 0; - // Read one SSE stream to completion, writing every chunk to the client - // and tracking response_id + sequence_number from each event. Returns - // whether we saw the [DONE] sentinel. + // Tracks the in-progress assistant message item across the stream so we + // can emit synthetic closure events on resume. Without this, attempt 1's + // partial text bubble keeps receiving deltas from attempt 2's fresh + // generation — the client sees attempt-1-partial + attempt-2-full + // concatenated. Sealing the partial turns it into its own bubble. + type ActiveMessage = { + itemId: string; + outputIndex: number; + contentIndex: number; + text: string; + }; + let activeMessage: ActiveMessage | null = null; + + // Emit the closure events that finalize activeMessage with the text + // we've accumulated so far plus a short 'resuming…' suffix. Tunes + // OpenAI Responses API semantics enough for the Vercel AI SDK to + // treat it as a completed message — the next output_item.added in + // attempt 2 then starts a fresh assistant bubble. + const sealActiveMessage = (suffix: string) => { + if (!activeMessage) return; + const finalText = activeMessage.text + suffix; + if (suffix) { + res.write( + `event: response.output_text.delta\ndata: ${JSON.stringify({ + type: 'response.output_text.delta', + item_id: activeMessage.itemId, + output_index: activeMessage.outputIndex, + content_index: activeMessage.contentIndex, + delta: suffix, + })}\n\n`, + ); + } + res.write( + `event: response.content_part.done\ndata: ${JSON.stringify({ + type: 'response.content_part.done', + item_id: activeMessage.itemId, + output_index: activeMessage.outputIndex, + content_index: activeMessage.contentIndex, + part: { + type: 'output_text', + text: finalText, + annotations: [], + }, + })}\n\n`, + ); + res.write( + `event: response.output_item.done\ndata: ${JSON.stringify({ + type: 'response.output_item.done', + output_index: activeMessage.outputIndex, + item: { + id: activeMessage.itemId, + type: 'message', + role: 'assistant', + status: 'completed', + content: [ + { type: 'output_text', text: finalText, annotations: [] }, + ], + }, + })}\n\n`, + ); + activeMessage = null; + }; + + // Read one SSE stream, track metadata + in-progress items, optionally + // emit synthetic closure events, then forward each frame to the client. + // Returns whether we saw the [DONE] sentinel. const pumpStream = async (upstream: globalThis.Response) => { if (!upstream.body) return false; const reader = upstream.body.getReader(); @@ -155,41 +218,96 @@ if (agentBackendUrl) { while (true) { const { done, value } = await reader.read(); if (done) break; - res.write(value); buf += decoder.decode(value, { stream: true }); - // Pull out complete SSE frames (separated by \n\n) to sniff metadata. 
const frames = buf.split(/\n\n/); buf = frames.pop() || ''; for (const frame of frames) { + const frameBytes = `${frame}\n\n`; if (frame.includes('data: [DONE]')) { + res.write(frameBytes); return true; } const dataLine = frame.split('\n').find((l) => l.startsWith('data:')); - if (!dataLine) continue; + if (!dataLine) { + res.write(frameBytes); + continue; + } + let parsed: Record | undefined; try { - const parsed = JSON.parse(dataLine.slice(5).trim()); - // Accept response_id from the dedicated top-level tag, or fall - // back to the response.id / top-level id shapes so this works - // across LongRunningAgentServer versions. - const rid = - parsed.response_id ?? - parsed.response?.id ?? - (typeof parsed.id === 'string' && parsed.id.startsWith('resp_') - ? parsed.id - : null); - if (!responseId && typeof rid === 'string') { - responseId = rid; - onFirstResponseId(responseId); - } - if ( - typeof parsed.sequence_number === 'number' && - parsed.sequence_number > lastSeq - ) { - lastSeq = parsed.sequence_number; - } + parsed = JSON.parse(dataLine.slice(5).trim()); } catch { - // Non-JSON SSE frame (e.g. heartbeats) — safe to ignore. + // Non-JSON SSE frame (e.g. heartbeats) — forward as-is. + res.write(frameBytes); + continue; + } + if (!parsed) { + res.write(frameBytes); + continue; + } + // Track response_id (several possible locations). + const nested = parsed.response as + | { id?: unknown } + | undefined; + const rid = + (typeof parsed.response_id === 'string' + ? (parsed.response_id as string) + : undefined) ?? + (typeof nested?.id === 'string' ? nested.id : undefined) ?? + (typeof parsed.id === 'string' && + (parsed.id as string).startsWith('resp_') + ? (parsed.id as string) + : undefined); + if (!responseId && typeof rid === 'string') { + responseId = rid; + onFirstResponseId(responseId); + } + if ( + typeof parsed.sequence_number === 'number' && + (parsed.sequence_number as number) > lastSeq + ) { + lastSeq = parsed.sequence_number as number; + } + const eventType = parsed.type as string | undefined; + const item = (parsed.item as Record | undefined) ?? undefined; + // Update activeMessage state (pre-forward). + if ( + eventType === 'response.output_item.added' && + item?.type === 'message' + ) { + activeMessage = { + itemId: (item.id as string) || '', + outputIndex: (parsed.output_index as number) ?? 0, + contentIndex: 0, + text: '', + }; + } else if ( + eventType === 'response.output_text.delta' && + activeMessage && + (parsed.item_id as string) === activeMessage.itemId + ) { + activeMessage.text += (parsed.delta as string) ?? ''; + } else if ( + eventType === 'response.output_item.done' && + item?.type === 'message' && + activeMessage?.itemId === (item.id as string) + ) { + // Backend closed the message itself; we don't need our synthetic + // closure. + activeMessage = null; } + // On the resume sentinel, seal any active message BEFORE forwarding + // the sentinel itself. Subsequent attempt-2 events will naturally + // emit a fresh output_item.added with a new item_id — the client + // sees a clean second assistant bubble. The interruption suffix + // was already appended when the upstream first closed (see the + // auto-resume loop below); seal with no additional suffix. 
+ if (eventType === 'response.resumed' && activeMessage) { + console.log( + `[/invocations] sealing interrupted message item=${activeMessage.itemId} text_len=${activeMessage.text.length}`, + ); + sealActiveMessage(''); + } + res.write(frameBytes); } } return false; @@ -217,6 +335,28 @@ if (agentBackendUrl) { console.log( `[/invocations] upstream closed without [DONE] response_id=${responseId} last_seq=${lastSeq}; entering auto-resume`, ); + // Surface the interruption to the user right away — otherwise they'd + // see the partial text sit frozen for ~10s until the stale threshold + // expires and the backend emits response.resumed. The seal-on-resume + // path below will also append text if sentinel arrives, but this + // first suffix makes the 'something is happening' signal immediate. + // Wrap in a helper call so TS widens the type back — async functions + // mutating closure variables aren't tracked through ``await`` boundaries. + const readActive = (): ActiveMessage | null => activeMessage; + const am = readActive(); + if (am) { + const suffix = '\n\n_(connection interrupted — reconnecting…)_'; + res.write( + `event: response.output_text.delta\ndata: ${JSON.stringify({ + type: 'response.output_text.delta', + item_id: am.itemId, + output_index: am.outputIndex, + content_index: am.contentIndex, + delta: suffix, + })}\n\n`, + ); + am.text += suffix; + } } while (!sawDone && responseId && resumeAttempt < MAX_RESUME_ATTEMPTS) { resumeAttempt += 1; From a9c94c470dddab5946659c1fae74dda471368337 Mon Sep 17 00:00:00 2001 From: Dhruv Gupta Date: Mon, 20 Apr 2026 22:31:27 +0000 Subject: [PATCH 11/47] Heal orphan tool_calls in the OpenAI Session on every turn MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Durable-resume can interrupt the pod between an LLM emitting tool_calls and the SDK finishing the tool executions — the Session is left with function_call items whose matching function_call_output never got written. The next LLM request over that session fails: 400 BAD_REQUEST: An assistant message with 'tool_calls' must be followed by tool messages responding to each 'tool_call_id'. The following tool_call_ids did not have response messages: call_xxx, call_yyy, ... Piggy-back on deduplicate_input (which already touches the session each turn) to inject synthetic function_call_output items for every orphan function_call. Message is plain-text, so the LLM sees 'tool X was interrupted, please retry if needed' and can decide whether to re-call or continue. No change to agent.py. --- agent-openai-advanced/agent_server/utils.py | 57 +++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/agent-openai-advanced/agent_server/utils.py b/agent-openai-advanced/agent_server/utils.py index 13b90cbb..67f5dba8 100644 --- a/agent-openai-advanced/agent_server/utils.py +++ b/agent-openai-advanced/agent_server/utils.py @@ -150,6 +150,59 @@ def replace_fake_id(obj: Any, real_id: str) -> Any: return obj +async def _heal_orphan_tool_calls(session: AsyncDatabricksSession) -> None: + """Add synthetic function_call_output for any function_call missing one. + + Durable-resume can interrupt the pod between the LLM emitting tool_calls + and the SDK finishing the tool executions — when that happens the Session + is left with function_call items whose matching function_call_output + never got written. The very next LLM request over this session fails + with 'assistant message with tool_calls must be followed by tool + messages…'. 
Patching in a short synthetic output keeps the conversation + valid so the LLM can retry or acknowledge the interruption cleanly. + """ + items = await session.get_items() + + def _get(item, key): + if isinstance(item, dict): + return item.get(key) + return getattr(item, key, None) + + call_ids_with_output: set[str] = set() + pending_calls: list[tuple[str, str]] = [] + for item in items: + t = _get(item, "type") + call_id = _get(item, "call_id") + if not call_id: + continue + if t == "function_call_output": + call_ids_with_output.add(call_id) + elif t == "function_call": + pending_calls.append((call_id, _get(item, "name") or "")) + + orphans = [ + { + "type": "function_call_output", + "call_id": cid, + "output": ( + f"Tool call '{name}' was interrupted by a durable resume " + "and did not complete. Please retry if still needed." + ), + } + for cid, name in pending_calls + if cid not in call_ids_with_output + ] + if orphans: + logger.info( + "Sanitizing session %s: injecting %d synthetic function_call_output " + "items for orphan tool calls (ids=%s)", + session.session_id, + len(orphans), + [o["call_id"] for o in orphans], + ) + await session.add_items(orphans) + + async def deduplicate_input(request: ResponsesAgentRequest, session: AsyncDatabricksSession) -> list[dict]: """Return the input messages to pass to the Runner, avoiding duplication with session history. @@ -157,7 +210,11 @@ async def deduplicate_input(request: ResponsesAgentRequest, session: AsyncDatabr that history persisted, passing everything through would duplicate messages. If the session already covers the prior turns, only the latest message is needed since the session will prepend the full history automatically. + + Also heals any orphan tool_calls left by a prior durable-resume interrupt + so the next LLM request over this session is a valid conversation. """ + await _heal_orphan_tool_calls(session) messages = [i.model_dump() for i in request.input] # Empty input is a valid signal from the long-running server to resume an # existing session without re-sending user content — the session already From 7f15d2fceca66626ea031853ea4d0cc7b8ba5a04 Mon Sep 17 00:00:00 2001 From: Dhruv Gupta Date: Mon, 20 Apr 2026 22:41:06 +0000 Subject: [PATCH 12/47] Sanitize OpenAI session: dedupe + inject synthetic outputs in-place The previous heal added synthetic function_call_output at the END of the session (add_items only appends). When the conversation has a message between the orphan function_call and the synthetic output, the SDK rebuilds the LLM request as an assistant-with-tool_calls message that doesn't have its tool responses right after it, and the API rejects with 'assistant message with tool_calls must be followed by tool messages'. Also: the Vercel AI SDK client echoes the full conversation back each turn. deduplicate_input drops most of it but the Runner.run path can still re-persist prior items, leaving DUPLICATE function_call rows for the same call_id. Replace with a clear+rebuild sanitize pass: dedupe function_call / function_call_output by call_id, inject synthetic outputs immediately after any orphan function_call, clear the session, and re-add the canonical sequence. No-op when already clean. 
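A worked example of the rebuild (hypothetical items, condensed to the fields the
sanitizer inspects): a crash after the LLM emitted call_1 but before the tool ran,
plus a client history echo that re-persisted the same function_call.

    before = [
        {"type": "message", "role": "user", "content": "check the weather"},
        {"type": "function_call", "call_id": "call_1", "name": "get_weather"},
        {"type": "function_call", "call_id": "call_1", "name": "get_weather"},  # duplicate
        {"type": "message", "role": "user", "content": "any update?"},
    ]
    after = [
        {"type": "message", "role": "user", "content": "check the weather"},
        {"type": "function_call", "call_id": "call_1", "name": "get_weather"},
        {"type": "function_call_output", "call_id": "call_1",
         "output": "Tool call 'get_weather' was interrupted by a durable resume "
                   "and did not complete. Please retry if still needed."},
        {"type": "message", "role": "user", "content": "any update?"},
    ]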
--- agent-openai-advanced/agent_server/utils.py | 158 ++++++++++++++------ 1 file changed, 110 insertions(+), 48 deletions(-) diff --git a/agent-openai-advanced/agent_server/utils.py b/agent-openai-advanced/agent_server/utils.py index 67f5dba8..ce9b71a1 100644 --- a/agent-openai-advanced/agent_server/utils.py +++ b/agent-openai-advanced/agent_server/utils.py @@ -150,57 +150,118 @@ def replace_fake_id(obj: Any, real_id: str) -> Any: return obj -async def _heal_orphan_tool_calls(session: AsyncDatabricksSession) -> None: - """Add synthetic function_call_output for any function_call missing one. - - Durable-resume can interrupt the pod between the LLM emitting tool_calls - and the SDK finishing the tool executions — when that happens the Session - is left with function_call items whose matching function_call_output - never got written. The very next LLM request over this session fails - with 'assistant message with tool_calls must be followed by tool - messages…'. Patching in a short synthetic output keeps the conversation - valid so the LLM can retry or acknowledge the interruption cleanly. +def _item_get(item, key): + if isinstance(item, dict): + return item.get(key) + return getattr(item, key, None) + + +def _item_dict(item): + """Normalize an item to a plain dict for re-persistence.""" + if isinstance(item, dict): + return dict(item) + if hasattr(item, "model_dump"): + return item.model_dump() + return dict(item.__dict__) if hasattr(item, "__dict__") else {} + + +async def _sanitize_session(session: AsyncDatabricksSession) -> None: + """Rebuild the session so the conversation is valid for the next LLM call. + + Two failure modes to handle on each turn: + + 1. **Orphan tool_calls from durable resume.** A kill mid-tool leaves + ``function_call`` items in the session with no matching + ``function_call_output``. The next ``Runner.run`` fails 400 with + 'assistant message with tool_calls must be followed by tool + messages…'. + + 2. **Duplicate items from client history echo.** The Vercel AI SDK + on the frontend re-sends the full conversation in ``request.input`` + every turn. Our ``deduplicate_input`` trims it down to just the + latest user message on the fast path, but on a resumed turn the + SDK can still re-persist prior items (same ``call_id`` appearing + twice). Duplicate ``function_call`` items confuse the LLM API + even when every call_id has *an* output. + + Fix: walk the items in chronological order, dedupe by ``call_id`` for + function_call / function_call_output, and inject a synthetic + ``function_call_output`` immediately after any ``function_call`` whose + matching output isn't present. Clear the session and re-add the + sanitized list so positional ordering is restored (SQLAlchemySession's + ``add_items`` only appends). + + No-op if the session is already clean. """ items = await session.get_items() + if not items: + return - def _get(item, key): - if isinstance(item, dict): - return item.get(key) - return getattr(item, key, None) - + # First pass: collect call_ids that have outputs anywhere in the history. 
call_ids_with_output: set[str] = set() - pending_calls: list[tuple[str, str]] = [] for item in items: - t = _get(item, "type") - call_id = _get(item, "call_id") - if not call_id: - continue - if t == "function_call_output": - call_ids_with_output.add(call_id) - elif t == "function_call": - pending_calls.append((call_id, _get(item, "name") or "")) - - orphans = [ - { - "type": "function_call_output", - "call_id": cid, - "output": ( - f"Tool call '{name}' was interrupted by a durable resume " - "and did not complete. Please retry if still needed." - ), - } - for cid, name in pending_calls - if cid not in call_ids_with_output - ] - if orphans: - logger.info( - "Sanitizing session %s: injecting %d synthetic function_call_output " - "items for orphan tool calls (ids=%s)", - session.session_id, - len(orphans), - [o["call_id"] for o in orphans], - ) - await session.add_items(orphans) + if _item_get(item, "type") == "function_call_output": + cid = _item_get(item, "call_id") + if cid: + call_ids_with_output.add(cid) + + # Second pass: build the canonical sequence. Dedup function_call / + # function_call_output by call_id, insert synthetic outputs where + # missing, keep messages / other items as-is. + sanitized: list[dict] = [] + seen_calls: set[str] = set() + seen_outputs: set[str] = set() + needed_injection: list[str] = [] + + for item in items: + t = _item_get(item, "type") + cid = _item_get(item, "call_id") + if t == "function_call" and cid: + if cid in seen_calls: + continue # drop duplicate + seen_calls.add(cid) + sanitized.append(_item_dict(item)) + # If this function_call has no matching output anywhere in the + # session, inject a synthetic one immediately after it. + if cid not in call_ids_with_output: + name = _item_get(item, "name") or "" + sanitized.append( + { + "type": "function_call_output", + "call_id": cid, + "output": ( + f"Tool call '{name}' was interrupted by a durable " + "resume and did not complete. Please retry if " + "still needed." + ), + } + ) + needed_injection.append(cid) + elif t == "function_call_output" and cid: + if cid in seen_outputs: + continue # drop duplicate output + seen_outputs.add(cid) + sanitized.append(_item_dict(item)) + else: + sanitized.append(_item_dict(item)) + + # If the sanitized sequence equals the original (same count, no orphans, + # no duplicates), skip the clear+rebuild — it's a no-op and saves DB work. + if len(sanitized) == len(items) and not needed_injection: + return + + logger.info( + "Sanitizing session %s: original=%d items, sanitized=%d items, " + "synthetic outputs injected=%d (call_ids=%s)", + session.session_id, + len(items), + len(sanitized), + len(needed_injection), + needed_injection, + ) + await session.clear_session() + if sanitized: + await session.add_items(sanitized) async def deduplicate_input(request: ResponsesAgentRequest, session: AsyncDatabricksSession) -> list[dict]: @@ -211,10 +272,11 @@ async def deduplicate_input(request: ResponsesAgentRequest, session: AsyncDatabr If the session already covers the prior turns, only the latest message is needed since the session will prepend the full history automatically. - Also heals any orphan tool_calls left by a prior durable-resume interrupt + Also sanitizes the session (dedupes duplicate items and injects synthetic + outputs for orphan tool_calls left behind by a durable-resume interrupt) so the next LLM request over this session is a valid conversation. 
""" - await _heal_orphan_tool_calls(session) + await _sanitize_session(session) messages = [i.model_dump() for i in request.input] # Empty input is a valid signal from the long-running server to resume an # existing session without re-sending user content — the session already From 61dcfc03927f511453c489202f3440f345a6a11c Mon Sep 17 00:00:00 2001 From: Dhruv Gupta Date: Mon, 20 Apr 2026 22:54:24 +0000 Subject: [PATCH 13/47] Replace interrupted text with attempt 2's on durable resume MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Keep the UI minimal but fix the doubled-text issue: when a mid-stream kill happens, the AI SDK merges all deltas within one streamText call into one UIMessage — so our proxy-level seal events were valid but invisible, and attempt 2's text kept appending to attempt 1's partial. Minimal solution: 1. Express /invocations proxy already emits response.resumed at the attempt boundary (unchanged). 2. chat.ts server: detect response.resumed via onChunk and forward it to the UI stream as { type: 'data-resumed', data: { attempt } }. 3. chat.tsx client: on 'data-resumed', call setMessages to drop all text parts from the last (assistant) message. Tool call parts stay because they dedupe by call_id naturally. Also: fix auto-resume loop burning MAX_RESUME_ATTEMPTS on terminal errors by exiting early when an error event with code=task_failed or code=task_timeout comes through the proxy. No changes to agent.py. Agnosticism tenet intact. --- .../client/src/components/chat.tsx | 17 ++++++++++ .../packages/core/src/types.ts | 3 ++ e2e-chatbot-app-next/server/src/index.ts | 20 +++++++++++- .../server/src/routes/chat.ts | 31 +++++++++++++++++++ 4 files changed, 70 insertions(+), 1 deletion(-) diff --git a/e2e-chatbot-app-next/client/src/components/chat.tsx b/e2e-chatbot-app-next/client/src/components/chat.tsx index 735b894c..ca16ab65 100644 --- a/e2e-chatbot-app-next/client/src/components/chat.tsx +++ b/e2e-chatbot-app-next/client/src/components/chat.tsx @@ -181,6 +181,23 @@ export function Chat({ setTitlePending(false); fetchChatHistory(); } + // Durable-resume visual reset: when the backend's LongRunningAgentServer + // emits response.resumed (mid-stream pod crash + reclaim), the chat route + // writes a data-resumed part to signal us. Drop any text parts we've + // accumulated from the interrupted attempt so only the new attempt's + // text renders in-place. Tool parts are kept because they naturally + // de-dup across attempts via call_id. + if (dataPart.type === 'data-resumed') { + setMessages((prev) => { + if (!prev.length) return prev; + const last = prev[prev.length - 1]; + if (last.role !== 'assistant') return prev; + const filtered = (last.parts ?? []).filter( + (p: { type?: string }) => p.type !== 'text', + ); + return [...prev.slice(0, -1), { ...last, parts: filtered }]; + }); + } }, onFinish: ({ isAbort, diff --git a/e2e-chatbot-app-next/packages/core/src/types.ts b/e2e-chatbot-app-next/packages/core/src/types.ts index fac94494..549a3093 100644 --- a/e2e-chatbot-app-next/packages/core/src/types.ts +++ b/e2e-chatbot-app-next/packages/core/src/types.ts @@ -12,6 +12,9 @@ export type CustomUIDataTypes = { usage: LanguageModelUsage; traceId: string | null; title: string; + // Emitted by the server at a durable-resume boundary so the client can + // drop text parts accumulated from the interrupted attempt. 
+ resumed: { attempt: number }; }; export type ChatMessage = UIMessage; diff --git a/e2e-chatbot-app-next/server/src/index.ts b/e2e-chatbot-app-next/server/src/index.ts index 4385148e..9081fec9 100644 --- a/e2e-chatbot-app-next/server/src/index.ts +++ b/e2e-chatbot-app-next/server/src/index.ts @@ -137,6 +137,10 @@ if (agentBackendUrl) { let responseId: string | null = null; let lastSeq = 0; let sawDone = false; + // Terminal-error flag: if the backend emits a task_failed error event + // (e.g. upstream LLM returned 502, task_timeout, permanent failure), + // exit the resume loop instead of hammering retrieve N more times. + let sawTerminalError = false; const onFirstResponseId = (rid: string) => { console.log(`[/invocations] background started response_id=${rid}`); }; @@ -307,6 +311,20 @@ if (agentBackendUrl) { ); sealActiveMessage(''); } + // Detect terminal errors (task_failed, task_timeout, etc.) so we + // don't burn MAX_RESUME_ATTEMPTS fetching a response that will + // never succeed. Upstream LLM 502s and permanent run failures + // both surface here. + if (eventType === 'error') { + const errObj = (parsed.error as Record) || {}; + const code = errObj.code as string | undefined; + if (code === 'task_failed' || code === 'task_timeout') { + console.log( + `[/invocations] terminal error code=${code} response_id=${responseId}; not retrying`, + ); + sawTerminalError = true; + } + } res.write(frameBytes); } } @@ -358,7 +376,7 @@ if (agentBackendUrl) { am.text += suffix; } } - while (!sawDone && responseId && resumeAttempt < MAX_RESUME_ATTEMPTS) { + while (!sawDone && !sawTerminalError && responseId && resumeAttempt < MAX_RESUME_ATTEMPTS) { resumeAttempt += 1; console.log( `[/invocations] resume fetch response_id=${responseId} starting_after=${lastSeq} attempt=${resumeAttempt}`, diff --git a/e2e-chatbot-app-next/server/src/routes/chat.ts b/e2e-chatbot-app-next/server/src/routes/chat.ts index e2f4080e..8f4ee01a 100644 --- a/e2e-chatbot-app-next/server/src/routes/chat.ts +++ b/e2e-chatbot-app-next/server/src/routes/chat.ts @@ -258,6 +258,16 @@ chatRouter.post('/', requireAuth, async (req: Request, res: Response) => { : {}), }; + // Track whether we've seen a durable-resume boundary so we can forward + // exactly one data-resumed event to the client (which uses it to wipe + // the interrupted attempt's text parts). writerRef is populated by the + // execute() callback below — onChunk runs inside the same stream and + // needs live access to the writer to push a data part mid-stream. + const writerRef: { current: { write: (part: unknown) => void } | null } = { + current: null, + }; + const emittedResumedAttempts = new Set(); + const result = streamText({ model, messages: modelMessages, @@ -281,6 +291,24 @@ chatRouter.post('/', requireAuth, async (req: Request, res: Response) => { if (!traceId && typeof raw?.trace_id === 'string') { traceId = raw.trace_id; } + // LongRunningAgentServer emits this at the attempt-1 → attempt-2 + // boundary after a crash + CAS claim. Forward it once to the + // client as a data-resumed part so the UI can drop the + // interrupted attempt's text parts (tools keep their cards). + if (raw?.type === 'response.resumed' && writerRef.current) { + const attempt = typeof raw?.attempt === 'number' ? 
raw.attempt : 2; + if (!emittedResumedAttempts.has(attempt)) { + emittedResumedAttempts.add(attempt); + try { + writerRef.current.write({ + type: 'data-resumed', + data: { attempt }, + }); + } catch (e) { + console.warn('[chat] failed to forward data-resumed:', e); + } + } + } } }, onFinish: ({ usage }) => { @@ -303,6 +331,9 @@ chatRouter.post('/', requireAuth, async (req: Request, res: Response) => { // rather than the AI SDK's default short-id format (e.g. "Xt8nZiQRj1fS4yiU"). generateId: generateUUID, execute: async ({ writer }) => { + // Expose writer to onChunk so it can forward data-resumed events + // the instant a durable-resume boundary is observed. + writerRef.current = writer as unknown as typeof writerRef.current; // Manually drain the AI stream so we can append the traceId data part // after all model chunks are processed (traceId is captured via onChunk). // result.toUIMessageStream() converts TextStreamPart → UIMessageChunk: From 75dae5d1bd0d6a2643d77b24bf06e386588292b0 Mon Sep 17 00:00:00 2001 From: Dhruv Gupta Date: Mon, 20 Apr 2026 22:58:51 +0000 Subject: [PATCH 14/47] Add debug logs for durable-resume data-resumed event propagation --- .../client/src/components/chat.tsx | 14 ++++++++++- .../server/src/routes/chat.ts | 23 +++++++++++++++++++ 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/e2e-chatbot-app-next/client/src/components/chat.tsx b/e2e-chatbot-app-next/client/src/components/chat.tsx index ca16ab65..77fc8a75 100644 --- a/e2e-chatbot-app-next/client/src/components/chat.tsx +++ b/e2e-chatbot-app-next/client/src/components/chat.tsx @@ -188,13 +188,25 @@ export function Chat({ // text renders in-place. Tool parts are kept because they naturally // de-dup across attempts via call_id. if (dataPart.type === 'data-resumed') { + console.log('[chat][onData] got data-resumed', dataPart); setMessages((prev) => { - if (!prev.length) return prev; + if (!prev.length) { + console.log('[chat][onData] no prev messages; ignoring'); + return prev; + } const last = prev[prev.length - 1]; + console.log( + `[chat][onData] last message role=${last.role} parts=${JSON.stringify( + (last.parts ?? []).map((p: { type?: string }) => p.type), + )}`, + ); if (last.role !== 'assistant') return prev; const filtered = (last.parts ?? []).filter( (p: { type?: string }) => p.type !== 'text', ); + console.log( + `[chat][onData] filtered: ${(last.parts ?? []).length} -> ${filtered.length} parts`, + ); return [...prev.slice(0, -1), { ...last, parts: filtered }]; }); } diff --git a/e2e-chatbot-app-next/server/src/routes/chat.ts b/e2e-chatbot-app-next/server/src/routes/chat.ts index 8f4ee01a..40b29ff3 100644 --- a/e2e-chatbot-app-next/server/src/routes/chat.ts +++ b/e2e-chatbot-app-next/server/src/routes/chat.ts @@ -279,6 +279,17 @@ chatRouter.post('/', requireAuth, async (req: Request, res: Response) => { onChunk: ({ chunk }) => { if (chunk.type === 'raw') { const raw = chunk.rawValue as any; + // Debug: surface any event type containing 'resum' so we can tell + // whether the AI SDK's responses provider is preserving the raw + // type intact. Remove once durable-resume UI is stable. 
+ if ( + typeof raw?.type === 'string' && + raw.type.toLowerCase().includes('resum') + ) { + console.log( + `[chat][onChunk] raw event type=${raw.type} keys=${Object.keys(raw).join(',')}`, + ); + } // Extract trace in Databricks serving endpoint output format, if present if (raw?.type === 'response.output_item.done') { const traceIdFromChunk = @@ -295,6 +306,11 @@ chatRouter.post('/', requireAuth, async (req: Request, res: Response) => { // boundary after a crash + CAS claim. Forward it once to the // client as a data-resumed part so the UI can drop the // interrupted attempt's text parts (tools keep their cards). + if (raw?.type === 'response.resumed') { + console.log( + `[chat][onChunk] saw response.resumed raw event attempt=${raw?.attempt} writer_ready=${!!writerRef.current}`, + ); + } if (raw?.type === 'response.resumed' && writerRef.current) { const attempt = typeof raw?.attempt === 'number' ? raw.attempt : 2; if (!emittedResumedAttempts.has(attempt)) { @@ -304,9 +320,16 @@ chatRouter.post('/', requireAuth, async (req: Request, res: Response) => { type: 'data-resumed', data: { attempt }, }); + console.log( + `[chat][onChunk] forwarded data-resumed attempt=${attempt} to UI stream`, + ); } catch (e) { console.warn('[chat] failed to forward data-resumed:', e); } + } else { + console.log( + `[chat][onChunk] already forwarded data-resumed for attempt=${attempt}, skipping`, + ); } } } From 47a063f481ec0e4b7c93839d98e4cd3b1c571e42 Mon Sep 17 00:00:00 2001 From: Dhruv Gupta Date: Mon, 20 Apr 2026 23:01:24 +0000 Subject: [PATCH 15/47] Catch-all log in onData to trace which data parts reach client --- e2e-chatbot-app-next/client/src/components/chat.tsx | 1 + 1 file changed, 1 insertion(+) diff --git a/e2e-chatbot-app-next/client/src/components/chat.tsx b/e2e-chatbot-app-next/client/src/components/chat.tsx index 77fc8a75..0202c147 100644 --- a/e2e-chatbot-app-next/client/src/components/chat.tsx +++ b/e2e-chatbot-app-next/client/src/components/chat.tsx @@ -170,6 +170,7 @@ export function Chat({ }, }), onData: (dataPart) => { + console.log(`[chat][onData] received dataPart type=${dataPart.type}`, dataPart); setDataStream((ds) => ds ? [...ds, dataPart as DataUIPart] : [], ); From c27c016a761bb72c8a636c0f3c8a52768ca0f2a3 Mon Sep 17 00:00:00 2001 From: Dhruv Gupta Date: Mon, 20 Apr 2026 23:05:22 +0000 Subject: [PATCH 16/47] Wipe text in-place on data-resumed instead of removing the part --- .../client/src/components/chat.tsx | 20 ++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/e2e-chatbot-app-next/client/src/components/chat.tsx b/e2e-chatbot-app-next/client/src/components/chat.tsx index 0202c147..9d6a6681 100644 --- a/e2e-chatbot-app-next/client/src/components/chat.tsx +++ b/e2e-chatbot-app-next/client/src/components/chat.tsx @@ -198,17 +198,27 @@ export function Chat({ const last = prev[prev.length - 1]; console.log( `[chat][onData] last message role=${last.role} parts=${JSON.stringify( - (last.parts ?? []).map((p: { type?: string }) => p.type), + (last.parts ?? []).map((p: { type?: string; text?: string }) => ({ + type: p.type, + len: p.text?.length, + })), )}`, ); if (last.role !== 'assistant') return prev; - const filtered = (last.parts ?? []).filter( - (p: { type?: string }) => p.type !== 'text', + // Reset the text content in place — removing the part lets the AI + // SDK recreate it with a fresh id AND the stale accumulated text. + // Instead, keep the same part id and wipe .text so the next delta + // appends to an empty string. 
Future deltas from attempt 2 will + // still hit the same text part; that's fine, the user just sees + // attempt 2's output. Tool parts keep their cards. + const updatedParts = (last.parts ?? []).map( + (p: { type?: string; text?: string }) => + p.type === 'text' ? { ...p, text: '' } : p, ); console.log( - `[chat][onData] filtered: ${(last.parts ?? []).length} -> ${filtered.length} parts`, + `[chat][onData] wiped text parts in place; parts remain ${updatedParts.length}`, ); - return [...prev.slice(0, -1), { ...last, parts: filtered }]; + return [...prev.slice(0, -1), { ...last, parts: updatedParts }]; }); } }, From bfd8f6ea0d86efe2bbb2c6ae124e0a96d088b9e4 Mon Sep 17 00:00:00 2001 From: Dhruv Gupta Date: Mon, 20 Apr 2026 23:13:31 +0000 Subject: [PATCH 17/47] Post-stream truncate as belt-and-suspenders for durable-resume text wipe MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Your 'clean up at end of stream' idea — much more robust than relying on mid-stream mutation sticking. On data-resumed we now snapshot the attempt-1 text length, and in onFinish we slice exactly that many chars off the front of the last assistant message's text parts. Whatever the AI SDK accumulator did during streaming, the final rendered state contains only attempt 2's content. The mid-stream mutation wipe stays in place too — when it sticks the text visibly clears during the 10s stale window, which is nicer UX than waiting for onFinish. When it doesn't stick, onFinish catches it. --- .../client/src/components/chat.tsx | 78 +++++++++++++------ 1 file changed, 55 insertions(+), 23 deletions(-) diff --git a/e2e-chatbot-app-next/client/src/components/chat.tsx b/e2e-chatbot-app-next/client/src/components/chat.tsx index 9d6a6681..90df11b3 100644 --- a/e2e-chatbot-app-next/client/src/components/chat.tsx +++ b/e2e-chatbot-app-next/client/src/components/chat.tsx @@ -69,6 +69,14 @@ export function Chat({ const resumeAttemptCountRef = useRef(0); const maxResumeAttempts = 3; + // Durable-resume snapshot: on data-resumed we record the length of the + // assistant message's text at that instant. If the mid-stream mutation + // wipe doesn't stick (AI SDK may re-batch deltas from its accumulator), + // onFinish does a post-stream truncation: drop the first N chars of the + // final text so only attempt-2 content remains. Keyed by message id so + // multi-turn conversations don't cross-contaminate. + const attempt1TextLenRef = useRef>({}); + const abortController = useRef(new AbortController()); useEffect(() => { return () => { @@ -190,35 +198,28 @@ export function Chat({ // de-dup across attempts via call_id. if (dataPart.type === 'data-resumed') { console.log('[chat][onData] got data-resumed', dataPart); + // Snapshot the current text length across all text parts of the last + // assistant message. onFinish below will use this to truncate the + // final rendered text so only attempt 2's content remains visible. setMessages((prev) => { - if (!prev.length) { - console.log('[chat][onData] no prev messages; ignoring'); - return prev; - } + if (!prev.length) return prev; const last = prev[prev.length - 1]; - console.log( - `[chat][onData] last message role=${last.role} parts=${JSON.stringify( - (last.parts ?? 
[]).map((p: { type?: string; text?: string }) => ({ - type: p.type, - len: p.text?.length, - })), - )}`, - ); if (last.role !== 'assistant') return prev; - // Reset the text content in place — removing the part lets the AI - // SDK recreate it with a fresh id AND the stale accumulated text. - // Instead, keep the same part id and wipe .text so the next delta - // appends to an empty string. Future deltas from attempt 2 will - // still hit the same text part; that's fine, the user just sees - // attempt 2's output. Tool parts keep their cards. - const updatedParts = (last.parts ?? []).map( - (p: { type?: string; text?: string }) => - p.type === 'text' ? { ...p, text: '' } : p, + const currentLen = (last.parts ?? []).reduce( + (acc: number, p: { type?: string; text?: string }) => + p.type === 'text' ? acc + (p.text?.length ?? 0) : acc, + 0, ); + attempt1TextLenRef.current[last.id] = currentLen; console.log( - `[chat][onData] wiped text parts in place; parts remain ${updatedParts.length}`, + `[chat][onData] recorded attempt-1 text length=${currentLen} for message=${last.id}`, ); - return [...prev.slice(0, -1), { ...last, parts: updatedParts }]; + // Also mutate in place as a best-effort mid-stream wipe. + for (const p of last.parts ?? []) { + const tp = p as { type?: string; text?: string }; + if (tp.type === 'text') tp.text = ''; + } + return [...prev]; }); } }, @@ -231,6 +232,37 @@ export function Chat({ didFetchHistoryOnNewChat.current = false; setTitlePending(false); + // Post-stream durable-resume truncation. If we saw data-resumed during + // this stream we recorded how many chars of text belonged to attempt 1. + // Now that the stream is complete (attempt 2 produced its answer), + // chop off exactly that many chars from the start so only attempt 2's + // text is rendered. Belt-and-suspenders over the mid-stream wipe. + const lastAssistant = finishedMessages?.at(-1); + if (lastAssistant && lastAssistant.role === 'assistant') { + const drop = attempt1TextLenRef.current[lastAssistant.id]; + if (drop && drop > 0) { + console.log( + `[chat][onFinish] post-stream truncate: removing first ${drop} chars of text parts for message=${lastAssistant.id}`, + ); + setMessages((prev) => { + if (!prev.length) return prev; + const last = prev[prev.length - 1]; + if (last.id !== lastAssistant.id) return prev; + let remaining = drop; + for (const p of last.parts ?? []) { + const tp = p as { type?: string; text?: string }; + if (tp.type !== 'text' || !tp.text) continue; + if (remaining <= 0) break; + const cut = Math.min(remaining, tp.text.length); + tp.text = tp.text.slice(cut); + remaining -= cut; + } + delete attempt1TextLenRef.current[lastAssistant.id]; + return [...prev]; + }); + } + } + // If user aborted, don't try to resume if (isAbort) { console.log('[Chat onFinish] Stream was aborted by user, not resuming'); From 931ab0f58e89573148212d4903d98904b4638b0d Mon Sep 17 00:00:00 2001 From: Dhruv Gupta Date: Mon, 20 Apr 2026 23:22:44 +0000 Subject: [PATCH 18/47] Post-stream truncate must create new part + message refs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PreviewMessage is memoized: while loading it compares prevProps.message to nextProps.message by reference; when not loading it deep-equals the parts array (which short-circuits on identical references). 
Our previous truncate mutated part.text in place and returned [...prev] — same message + same parts array refs, so the memo skipped the re-render and the old text stuck on screen even though state was technically updated. Map to NEW part objects with sliced text and wrap a NEW message object so both the reference check (loading path) and deep-equal (done path) see a change and re-render. --- .../client/src/components/chat.tsx | 25 +++++++++++++------ 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/e2e-chatbot-app-next/client/src/components/chat.tsx b/e2e-chatbot-app-next/client/src/components/chat.tsx index 90df11b3..a6c45b2a 100644 --- a/e2e-chatbot-app-next/client/src/components/chat.tsx +++ b/e2e-chatbot-app-next/client/src/components/chat.tsx @@ -236,7 +236,9 @@ export function Chat({ // this stream we recorded how many chars of text belonged to attempt 1. // Now that the stream is complete (attempt 2 produced its answer), // chop off exactly that many chars from the start so only attempt 2's - // text is rendered. Belt-and-suspenders over the mid-stream wipe. + // text is rendered. Must return NEW message + part object references — + // PreviewMessage is memoized and a reference-equal message short- + // circuits the re-render even if we mutated nested `.text` in place. const lastAssistant = finishedMessages?.at(-1); if (lastAssistant && lastAssistant.role === 'assistant') { const drop = attempt1TextLenRef.current[lastAssistant.id]; @@ -249,16 +251,25 @@ export function Chat({ const last = prev[prev.length - 1]; if (last.id !== lastAssistant.id) return prev; let remaining = drop; - for (const p of last.parts ?? []) { + const newParts = (last.parts ?? []).map((p) => { const tp = p as { type?: string; text?: string }; - if (tp.type !== 'text' || !tp.text) continue; - if (remaining <= 0) break; + if (tp.type !== 'text' || !tp.text) return p; + if (remaining <= 0) return p; const cut = Math.min(remaining, tp.text.length); - tp.text = tp.text.slice(cut); + const nextText = tp.text.slice(cut); remaining -= cut; - } + return { ...tp, text: nextText }; + }); delete attempt1TextLenRef.current[lastAssistant.id]; - return [...prev]; + const newLast = { ...last, parts: newParts }; + console.log( + `[chat][onFinish] truncated; new text parts lengths=${JSON.stringify( + newParts + .filter((p) => (p as { type?: string }).type === 'text') + .map((p) => (p as { text?: string }).text?.length), + )}`, + ); + return [...prev.slice(0, -1), newLast]; }); } } From 412e14f697f38f2d530c346593f84f87893d780d Mon Sep 17 00:00:00 2001 From: Dhruv Gupta Date: Mon, 20 Apr 2026 23:32:08 +0000 Subject: [PATCH 19/47] Mid-stream text replacement via render-time slice in Messages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit State-level wipes were getting clobbered by the AI SDK accumulator — ReactChatState.replaceMessage deep-clones state.message on every write(), and activeTextParts keeps mutating the originals behind the UI's back. Solution: transform at the VIEW layer instead of fighting the state machine. Chat component tracks attempt1TextLen per messageId (state, not ref, so it propagates to children). Messages maps each message through a render-time slice that drops the leading attempt-1 chars from text parts before passing to PreviewMessage. Creates new message + part objects so the memo's reference check trips and the component re-renders. 
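For illustration only, the render-time slice amounts to roughly the following
(simplified part shape; sliceLeadingText is an illustrative name, not an
export of this repo — the actual code lives in Messages.tsx below):

    type Part = { type: string; text?: string };

    // Return NEW part objects (and a new array) with the first `drop`
    // characters removed across the message's text parts, so a memoized
    // child comparing by reference or deep-equality sees a change.
    function sliceLeadingText(parts: Part[], drop: number): Part[] {
      let remaining = drop;
      return parts.map((p) => {
        if (p.type !== 'text' || !p.text || remaining <= 0) return p;
        const cut = Math.min(remaining, p.text.length);
        remaining -= cut;
        return { ...p, text: p.text.slice(cut) };
      });
    }

Because map() and the object spread produce fresh references, both the
loading-path reference check and the done-path deep-equal re-render.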
onFinish still does the authoritative setMessages truncate so the persisted-to-DB final message reflects only attempt 2. That truncate now also clears attempt1TextLen, so the render-time slice becomes a no-op after completion (state is already truncated). --- .../client/src/components/chat.tsx | 91 +++++++++++-------- .../client/src/components/messages.tsx | 65 +++++++++---- 2 files changed, 100 insertions(+), 56 deletions(-) diff --git a/e2e-chatbot-app-next/client/src/components/chat.tsx b/e2e-chatbot-app-next/client/src/components/chat.tsx index a6c45b2a..b3c93935 100644 --- a/e2e-chatbot-app-next/client/src/components/chat.tsx +++ b/e2e-chatbot-app-next/client/src/components/chat.tsx @@ -70,12 +70,18 @@ export function Chat({ const maxResumeAttempts = 3; // Durable-resume snapshot: on data-resumed we record the length of the - // assistant message's text at that instant. If the mid-stream mutation - // wipe doesn't stick (AI SDK may re-batch deltas from its accumulator), - // onFinish does a post-stream truncation: drop the first N chars of the - // final text so only attempt-2 content remains. Keyed by message id so - // multi-turn conversations don't cross-contaminate. - const attempt1TextLenRef = useRef>({}); + // assistant message's text at that instant. Used two ways: + // (1) at RENDER time, Messages slices the first N chars off the text + // parts of the active assistant message so mid-stream the UI shows + // only attempt-2 content (state stays as AI SDK writes it). + // (2) onFinish runs a final setMessages truncate so the persisted + // message also reflects only attempt 2 (would otherwise be saved + // with both attempts concatenated). + // Needs to be state (not ref) so the map-time slice in Messages re-runs + // when the snapshot changes. Keyed by message id for multi-turn safety. + const [attempt1TextLen, setAttempt1TextLen] = useState< + Record + >({}); const abortController = useRef(new AbortController()); useEffect(() => { @@ -198,29 +204,37 @@ export function Chat({ // de-dup across attempts via call_id. if (dataPart.type === 'data-resumed') { console.log('[chat][onData] got data-resumed', dataPart); - // Snapshot the current text length across all text parts of the last - // assistant message. onFinish below will use this to truncate the - // final rendered text so only attempt 2's content remains visible. - setMessages((prev) => { - if (!prev.length) return prev; - const last = prev[prev.length - 1]; - if (last.role !== 'assistant') return prev; - const currentLen = (last.parts ?? []).reduce( - (acc: number, p: { type?: string; text?: string }) => - p.type === 'text' ? acc + (p.text?.length ?? 0) : acc, - 0, - ); - attempt1TextLenRef.current[last.id] = currentLen; + // Snapshot the current text length across text parts of the last + // assistant message. Messages uses this to slice at render time. + // Mid-stream state mutation is fighting the AI SDK accumulator + // (replaceMessage structuredClones the message on every write()), + // so we transform at the view layer instead. + const lastAssistantId = (() => { + // Peek into current messages via functional setter without mutating. + let captured: { id: string; len: number } | null = null; + setMessages((prev) => { + if (prev.length) { + const last = prev[prev.length - 1]; + if (last.role === 'assistant') { + const currentLen = (last.parts ?? []).reduce( + (acc: number, p: { type?: string; text?: string }) => + p.type === 'text' ? acc + (p.text?.length ?? 
0) : acc, + 0, + ); + captured = { id: last.id, len: currentLen }; + } + } + return prev; + }); + return captured; + })(); + if (lastAssistantId) { + const { id, len } = lastAssistantId as { id: string; len: number }; + setAttempt1TextLen((prev) => ({ ...prev, [id]: len })); console.log( - `[chat][onData] recorded attempt-1 text length=${currentLen} for message=${last.id}`, + `[chat][onData] recorded attempt-1 text length=${len} for message=${id}`, ); - // Also mutate in place as a best-effort mid-stream wipe. - for (const p of last.parts ?? []) { - const tp = p as { type?: string; text?: string }; - if (tp.type === 'text') tp.text = ''; - } - return [...prev]; - }); + } } }, onFinish: ({ @@ -232,16 +246,13 @@ export function Chat({ didFetchHistoryOnNewChat.current = false; setTitlePending(false); - // Post-stream durable-resume truncation. If we saw data-resumed during - // this stream we recorded how many chars of text belonged to attempt 1. - // Now that the stream is complete (attempt 2 produced its answer), - // chop off exactly that many chars from the start so only attempt 2's - // text is rendered. Must return NEW message + part object references — - // PreviewMessage is memoized and a reference-equal message short- - // circuits the re-render even if we mutated nested `.text` in place. + // Post-stream durable-resume truncation. Persists attempt-2-only text + // into useChat's messages state (what gets saved to DB). The render- + // time slice in Messages handles mid-stream; this handles the final + // committed state. const lastAssistant = finishedMessages?.at(-1); if (lastAssistant && lastAssistant.role === 'assistant') { - const drop = attempt1TextLenRef.current[lastAssistant.id]; + const drop = attempt1TextLen[lastAssistant.id]; if (drop && drop > 0) { console.log( `[chat][onFinish] post-stream truncate: removing first ${drop} chars of text parts for message=${lastAssistant.id}`, @@ -260,8 +271,7 @@ export function Chat({ remaining -= cut; return { ...tp, text: nextText }; }); - delete attempt1TextLenRef.current[lastAssistant.id]; - const newLast = { ...last, parts: newParts }; + const newLast = { ...last, parts: newParts } as ChatMessage; console.log( `[chat][onFinish] truncated; new text parts lengths=${JSON.stringify( newParts @@ -271,6 +281,12 @@ export function Chat({ ); return [...prev.slice(0, -1), newLast]; }); + // Clear so the render-time slice in Messages stops kicking in + // (state is now already truncated). + setAttempt1TextLen((prev) => { + const { [lastAssistant.id]: _omit, ...rest } = prev; + return rest; + }); } } @@ -410,6 +426,7 @@ export function Chat({ isReadonly={isReadonly} selectedModelId={initialChatModel} feedback={feedback} + attempt1TextLen={attempt1TextLen} /> diff --git a/e2e-chatbot-app-next/client/src/components/messages.tsx b/e2e-chatbot-app-next/client/src/components/messages.tsx index 8b740452..aa6a5f9e 100644 --- a/e2e-chatbot-app-next/client/src/components/messages.tsx +++ b/e2e-chatbot-app-next/client/src/components/messages.tsx @@ -18,6 +18,12 @@ interface MessagesProps { isReadonly: boolean; selectedModelId: string; feedback?: FeedbackMap; + // Durable-resume: messageId -> chars of text that belonged to attempt 1. + // When present, slice that many chars off the front of the message's text + // parts at render time so only attempt-2 content shows. State itself is + // left alone because the AI SDK accumulator keeps restoring the full + // text via structuredClone on every write(). 
+ attempt1TextLen?: Record; } function PureMessages({ @@ -30,6 +36,7 @@ function PureMessages({ isReadonly, selectedModelId, feedback = {}, + attempt1TextLen, }: MessagesProps) { const { containerRef: messagesContainerRef, @@ -65,25 +72,45 @@ function PureMessages({ > - {messages.map((message, index) => ( - - ))} + {messages.map((message, index) => { + // Render-time durable-resume slice: if this message had a resume + // boundary recorded, remove the leading attempt-1 chars from its + // text parts before passing to PreviewMessage. Creates a new + // message object so the memo sees a reference change and + // re-renders. Tool / step / data parts are passed through. + let displayMessage = message; + const drop = attempt1TextLen?.[message.id]; + if (drop && drop > 0) { + let remaining = drop; + const newParts = (message.parts ?? []).map((p) => { + const tp = p as { type?: string; text?: string }; + if (tp.type !== 'text' || !tp.text || remaining <= 0) return p; + const cut = Math.min(remaining, tp.text.length); + remaining -= cut; + return { ...tp, text: tp.text.slice(cut) }; + }); + displayMessage = { ...message, parts: newParts } as ChatMessage; + } + return ( + + ); + })} {status === 'submitted' && messages.length > 0 && From b17eec8b17cc82990f20ad9a8ff028e49d2af91f Mon Sep 17 00:00:00 2001 From: Dhruv Gupta Date: Mon, 20 Apr 2026 23:49:54 +0000 Subject: [PATCH 20/47] Remove debug console.logs from durable-resume UI path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drop the [chat][onData] / [chat][onFinish] / [chat][onChunk] tracing statements that were used to trace the attempt-1 → attempt-2 flow while tuning the render-time slice and post-stream truncate. The server-side Express proxy still logs resume lifecycle (background started / resume fetch / terminal error / stream done) since that's operationally useful; the ai-bridge backend's [durable] INFO logs stay as-is. Co-authored-by: Isaac --- .../client/src/components/chat.tsx | 15 ------------ .../server/src/routes/chat.ts | 23 ------------------- 2 files changed, 38 deletions(-) diff --git a/e2e-chatbot-app-next/client/src/components/chat.tsx b/e2e-chatbot-app-next/client/src/components/chat.tsx index b3c93935..a9a9b38e 100644 --- a/e2e-chatbot-app-next/client/src/components/chat.tsx +++ b/e2e-chatbot-app-next/client/src/components/chat.tsx @@ -184,7 +184,6 @@ export function Chat({ }, }), onData: (dataPart) => { - console.log(`[chat][onData] received dataPart type=${dataPart.type}`, dataPart); setDataStream((ds) => ds ? [...ds, dataPart as DataUIPart] : [], ); @@ -203,7 +202,6 @@ export function Chat({ // text renders in-place. Tool parts are kept because they naturally // de-dup across attempts via call_id. if (dataPart.type === 'data-resumed') { - console.log('[chat][onData] got data-resumed', dataPart); // Snapshot the current text length across text parts of the last // assistant message. Messages uses this to slice at render time. 
// Mid-stream state mutation is fighting the AI SDK accumulator @@ -231,9 +229,6 @@ export function Chat({ if (lastAssistantId) { const { id, len } = lastAssistantId as { id: string; len: number }; setAttempt1TextLen((prev) => ({ ...prev, [id]: len })); - console.log( - `[chat][onData] recorded attempt-1 text length=${len} for message=${id}`, - ); } } }, @@ -254,9 +249,6 @@ export function Chat({ if (lastAssistant && lastAssistant.role === 'assistant') { const drop = attempt1TextLen[lastAssistant.id]; if (drop && drop > 0) { - console.log( - `[chat][onFinish] post-stream truncate: removing first ${drop} chars of text parts for message=${lastAssistant.id}`, - ); setMessages((prev) => { if (!prev.length) return prev; const last = prev[prev.length - 1]; @@ -272,13 +264,6 @@ export function Chat({ return { ...tp, text: nextText }; }); const newLast = { ...last, parts: newParts } as ChatMessage; - console.log( - `[chat][onFinish] truncated; new text parts lengths=${JSON.stringify( - newParts - .filter((p) => (p as { type?: string }).type === 'text') - .map((p) => (p as { text?: string }).text?.length), - )}`, - ); return [...prev.slice(0, -1), newLast]; }); // Clear so the render-time slice in Messages stops kicking in diff --git a/e2e-chatbot-app-next/server/src/routes/chat.ts b/e2e-chatbot-app-next/server/src/routes/chat.ts index 40b29ff3..8f4ee01a 100644 --- a/e2e-chatbot-app-next/server/src/routes/chat.ts +++ b/e2e-chatbot-app-next/server/src/routes/chat.ts @@ -279,17 +279,6 @@ chatRouter.post('/', requireAuth, async (req: Request, res: Response) => { onChunk: ({ chunk }) => { if (chunk.type === 'raw') { const raw = chunk.rawValue as any; - // Debug: surface any event type containing 'resum' so we can tell - // whether the AI SDK's responses provider is preserving the raw - // type intact. Remove once durable-resume UI is stable. - if ( - typeof raw?.type === 'string' && - raw.type.toLowerCase().includes('resum') - ) { - console.log( - `[chat][onChunk] raw event type=${raw.type} keys=${Object.keys(raw).join(',')}`, - ); - } // Extract trace in Databricks serving endpoint output format, if present if (raw?.type === 'response.output_item.done') { const traceIdFromChunk = @@ -306,11 +295,6 @@ chatRouter.post('/', requireAuth, async (req: Request, res: Response) => { // boundary after a crash + CAS claim. Forward it once to the // client as a data-resumed part so the UI can drop the // interrupted attempt's text parts (tools keep their cards). - if (raw?.type === 'response.resumed') { - console.log( - `[chat][onChunk] saw response.resumed raw event attempt=${raw?.attempt} writer_ready=${!!writerRef.current}`, - ); - } if (raw?.type === 'response.resumed' && writerRef.current) { const attempt = typeof raw?.attempt === 'number' ? 
raw.attempt : 2; if (!emittedResumedAttempts.has(attempt)) { @@ -320,16 +304,9 @@ chatRouter.post('/', requireAuth, async (req: Request, res: Response) => { type: 'data-resumed', data: { attempt }, }); - console.log( - `[chat][onChunk] forwarded data-resumed attempt=${attempt} to UI stream`, - ); } catch (e) { console.warn('[chat] failed to forward data-resumed:', e); } - } else { - console.log( - `[chat][onChunk] already forwarded data-resumed for attempt=${attempt}, skipping`, - ); } } } From 20f87cbcd78ca823bc956321d4428727e4102004 Mon Sep 17 00:00:00 2001 From: Dhruv Gupta Date: Tue, 21 Apr 2026 01:35:08 +0000 Subject: [PATCH 21/47] Use library-side durable-resume repair helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the per-template workarounds for mid-tool crash-resume into the databricks-ai-bridge library and wire them in: - agent-openai-advanced/utils.py: deduplicate_input now calls session.repair() (new public method on AsyncDatabricksSession) instead of the 100-line in-template _sanitize_session. Same behavior — dedupe function_call/function_call_output by call_id, inject synthetic outputs for orphans — just owned by the library. - agent-langgraph-advanced/agent.py: before agent.astream, call build_tool_resume_repair on the checkpointer's messages and apply via agent.aupdate_state(..., as_node="tools"). The as_node is critical — without it LangGraph re-evaluates the model→{tools,END} branch from the updated state and crashes with KeyError: 'model'. - agent-langgraph-advanced/agent.py: when the checkpointer already has a thread, only forward the latest user turn from request.input — the UI client (Vercel AI SDK) re-echoes the full history on every turn, which can re-inject orphan tool_uses from a previously-interrupted attempt that the client kept in its buffer. Both pyproject.toml files now pin databricks-openai / databricks-langchain to the same ai-bridge branch (subdirectory git sources) so the new helpers are picked up. Temporary; revert to registry once the bridge PR merges. Co-authored-by: Isaac --- .../agent_server/agent.py | 40 +++++- agent-langgraph-advanced/pyproject.toml | 1 + agent-openai-advanced/agent_server/utils.py | 122 +----------------- agent-openai-advanced/pyproject.toml | 1 + 4 files changed, 41 insertions(+), 123 deletions(-) diff --git a/agent-langgraph-advanced/agent_server/agent.py b/agent-langgraph-advanced/agent_server/agent.py index 98e121b9..a39d3e72 100644 --- a/agent-langgraph-advanced/agent_server/agent.py +++ b/agent-langgraph-advanced/agent_server/agent.py @@ -4,7 +4,7 @@ import mlflow from databricks.sdk import WorkspaceClient -from databricks_langchain import ChatDatabricks +from databricks_langchain import ChatDatabricks, build_tool_resume_repair from fastapi import HTTPException from langchain.agents import create_agent from langchain_core.messages import AnyMessage @@ -110,10 +110,7 @@ async def stream_handler( if user_id: config["configurable"]["user_id"] = user_id - input_state: dict[str, Any] = { - "messages": to_chat_completions_input([i.model_dump() for i in request.input]), - "custom_inputs": dict(request.custom_inputs or {}), - } + input_messages = to_chat_completions_input([i.model_dump() for i in request.input]) try: async with lakebase_context(LAKEBASE_CONFIG) as (checkpointer, store): @@ -123,6 +120,39 @@ async def stream_handler( # For on-behalf-of user authentication, pass get_user_workspace_client() to init_agent. 
agent = await init_agent(store=store, checkpointer=checkpointer) + # Durable-resume repair: a kill mid-tool leaves an AIMessage with + # tool_calls whose ToolMessage responses never landed. Inject + # synthetic ToolMessages (as_node="tools" so the graph transitions + # tools → model next, not re-evaluating the model→{tools,END} + # conditional branch which doesn't know "model" as a destination). + # No-op when state is clean. + state = await agent.aget_state(config) + has_history = bool(state and state.values.get("messages")) + if has_history: + repair = build_tool_resume_repair(state.values["messages"]) + if repair: + await agent.aupdate_state( + config, {"messages": repair}, as_node="tools" + ) + + # If the thread has history in the checkpointer, only forward the + # latest user turn — prior turns already live in state. Echoing the + # full conversation (common from UI clients) can re-inject orphan + # tool_uses left over in the client's buffer from a previously + # interrupted attempt, tripping Anthropic's tool_use → tool_result + # pairing check on the next LLM call. + if has_history and input_messages: + last_user = next( + (m for m in reversed(input_messages) if m.get("role") == "user"), + None, + ) + input_messages = [last_user] if last_user else [] + + input_state: dict[str, Any] = { + "messages": input_messages, + "custom_inputs": dict(request.custom_inputs or {}), + } + async for event in process_agent_astream_events( agent.astream(input_state, config, stream_mode=["updates", "messages"]) ): diff --git a/agent-langgraph-advanced/pyproject.toml b/agent-langgraph-advanced/pyproject.toml index 6c9f2256..cf326392 100644 --- a/agent-langgraph-advanced/pyproject.toml +++ b/agent-langgraph-advanced/pyproject.toml @@ -43,6 +43,7 @@ default-groups = ["dev", "setup"] # changes are in review. Revert to the registry release once merged. [tool.uv.sources] databricks-ai-bridge = { git = "https://github.com/databricks/databricks-ai-bridge", branch = "dhruv0811/durable-execution-resume" } +databricks-langchain = { git = "https://github.com/databricks/databricks-ai-bridge", branch = "dhruv0811/durable-execution-resume", subdirectory = "integrations/langchain" } [tool.pytest.ini_options] diff --git a/agent-openai-advanced/agent_server/utils.py b/agent-openai-advanced/agent_server/utils.py index 43e6cea2..aa9ae11f 100644 --- a/agent-openai-advanced/agent_server/utils.py +++ b/agent-openai-advanced/agent_server/utils.py @@ -183,120 +183,6 @@ def replace_fake_id(obj: Any, real_id: str) -> Any: return obj -def _item_get(item, key): - if isinstance(item, dict): - return item.get(key) - return getattr(item, key, None) - - -def _item_dict(item): - """Normalize an item to a plain dict for re-persistence.""" - if isinstance(item, dict): - return dict(item) - if hasattr(item, "model_dump"): - return item.model_dump() - return dict(item.__dict__) if hasattr(item, "__dict__") else {} - - -async def _sanitize_session(session: AsyncDatabricksSession) -> None: - """Rebuild the session so the conversation is valid for the next LLM call. - - Two failure modes to handle on each turn: - - 1. **Orphan tool_calls from durable resume.** A kill mid-tool leaves - ``function_call`` items in the session with no matching - ``function_call_output``. The next ``Runner.run`` fails 400 with - 'assistant message with tool_calls must be followed by tool - messages…'. - - 2. **Duplicate items from client history echo.** The Vercel AI SDK - on the frontend re-sends the full conversation in ``request.input`` - every turn. 
Our ``deduplicate_input`` trims it down to just the - latest user message on the fast path, but on a resumed turn the - SDK can still re-persist prior items (same ``call_id`` appearing - twice). Duplicate ``function_call`` items confuse the LLM API - even when every call_id has *an* output. - - Fix: walk the items in chronological order, dedupe by ``call_id`` for - function_call / function_call_output, and inject a synthetic - ``function_call_output`` immediately after any ``function_call`` whose - matching output isn't present. Clear the session and re-add the - sanitized list so positional ordering is restored (SQLAlchemySession's - ``add_items`` only appends). - - No-op if the session is already clean. - """ - items = await session.get_items() - if not items: - return - - # First pass: collect call_ids that have outputs anywhere in the history. - call_ids_with_output: set[str] = set() - for item in items: - if _item_get(item, "type") == "function_call_output": - cid = _item_get(item, "call_id") - if cid: - call_ids_with_output.add(cid) - - # Second pass: build the canonical sequence. Dedup function_call / - # function_call_output by call_id, insert synthetic outputs where - # missing, keep messages / other items as-is. - sanitized: list[dict] = [] - seen_calls: set[str] = set() - seen_outputs: set[str] = set() - needed_injection: list[str] = [] - - for item in items: - t = _item_get(item, "type") - cid = _item_get(item, "call_id") - if t == "function_call" and cid: - if cid in seen_calls: - continue # drop duplicate - seen_calls.add(cid) - sanitized.append(_item_dict(item)) - # If this function_call has no matching output anywhere in the - # session, inject a synthetic one immediately after it. - if cid not in call_ids_with_output: - name = _item_get(item, "name") or "" - sanitized.append( - { - "type": "function_call_output", - "call_id": cid, - "output": ( - f"Tool call '{name}' was interrupted by a durable " - "resume and did not complete. Please retry if " - "still needed." - ), - } - ) - needed_injection.append(cid) - elif t == "function_call_output" and cid: - if cid in seen_outputs: - continue # drop duplicate output - seen_outputs.add(cid) - sanitized.append(_item_dict(item)) - else: - sanitized.append(_item_dict(item)) - - # If the sanitized sequence equals the original (same count, no orphans, - # no duplicates), skip the clear+rebuild — it's a no-op and saves DB work. - if len(sanitized) == len(items) and not needed_injection: - return - - logger.info( - "Sanitizing session %s: original=%d items, sanitized=%d items, " - "synthetic outputs injected=%d (call_ids=%s)", - session.session_id, - len(items), - len(sanitized), - len(needed_injection), - needed_injection, - ) - await session.clear_session() - if sanitized: - await session.add_items(sanitized) - - async def deduplicate_input(request: ResponsesAgentRequest, session: AsyncDatabricksSession) -> list[dict]: """Return the input messages to pass to the Runner, avoiding duplication with session history. @@ -305,11 +191,11 @@ async def deduplicate_input(request: ResponsesAgentRequest, session: AsyncDatabr If the session already covers the prior turns, only the latest message is needed since the session will prepend the full history automatically. - Also sanitizes the session (dedupes duplicate items and injects synthetic - outputs for orphan tool_calls left behind by a durable-resume interrupt) - so the next LLM request over this session is a valid conversation. 
+ Also repairs the session (dedupes items and injects synthetic outputs for + orphan tool_calls left behind by a durable-resume interrupt) so the next + LLM request over this session is a valid conversation. """ - await _sanitize_session(session) + await session.repair() messages = [i.model_dump() for i in request.input] # Empty input is a valid signal from the long-running server to resume an # existing session without re-sending user content — the session already diff --git a/agent-openai-advanced/pyproject.toml b/agent-openai-advanced/pyproject.toml index 4a162250..87f62deb 100644 --- a/agent-openai-advanced/pyproject.toml +++ b/agent-openai-advanced/pyproject.toml @@ -47,6 +47,7 @@ default-groups = ["dev", "setup"] # changes are in review. Revert to the registry release once merged. [tool.uv.sources] databricks-ai-bridge = { git = "https://github.com/databricks/databricks-ai-bridge", branch = "dhruv0811/durable-execution-resume" } +databricks-openai = { git = "https://github.com/databricks/databricks-ai-bridge", branch = "dhruv0811/durable-execution-resume", subdirectory = "integrations/openai" } [tool.pytest.ini_options] base_url = "http://localhost:8000" From 0374ff45b3507f69e3b2fb21dfef081b24b3d0c0 Mon Sep 17 00:00:00 2001 From: Dhruv Gupta Date: Tue, 21 Apr 2026 20:59:54 +0000 Subject: [PATCH 22/47] Simplify: langgraph repair via middleware, UI minimal reset on resume MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Library side (databricks-langchain, PR #416): - New build_tool_resume_repair_middleware() returns an AgentMiddleware whose before_model hook runs build_tool_resume_repair. Swaps the manual aget_state / aupdate_state(as_node="tools") surgery in the template for a one-line `middleware=[...]` arg to create_agent. - The as_node="tools" footgun (KeyError: 'model' in the model→{tools,END} conditional branch re-eval) disappears entirely; repair runs inside the graph's own execution flow, not as external state surgery. Template (agent-langgraph-advanced): - init_agent: add middleware=[build_tool_resume_repair_middleware()] to create_agent. stream_handler drops the 8-line repair block. - utils.py process_agent_astream_events: skip None node_data (the graph's updates stream emits {middleware_node: None} when the middleware is a no-op, which is every turn on the happy path). UI (e2e-chatbot-app-next): - On data-resumed from the backend, wipe text parts from the last assistant message in one setMessages. Tool-call parts are kept as-is (they already dedupe across attempts by call_id). Dropped: * attempt1TextLen state + per-message snapshot in onData * render-time text slice in Messages.tsx * onFinish authoritative post-stream truncate The AI SDK's seal-on-resume synthesis (Express proxy) still creates a fresh output_item_id for attempt 2, so new deltas land in a fresh text part — our wipe of the old text part is sufficient. Net: -99 LOC across 4 files. Same behavior for the "delete old text, leave tools alone" UX; substantially less state-machine choreography. 
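For reference, the invariant both repair paths enforce — every tool call cut
off by a resume gets a synthetic output before the next model call — boils
down to roughly this (illustrative item shape and helper name; not the
bridge library's actual implementation):

    type Item = { type: string; call_id?: string; name?: string; output?: string };

    // Append a synthetic function_call_output for every function_call whose
    // call_id never received an output anywhere in the history.
    function repairOrphanToolCalls(items: Item[]): Item[] {
      const answered = new Set(
        items
          .filter((i) => i.type === 'function_call_output' && i.call_id)
          .map((i) => i.call_id as string),
      );
      const synthetic: Item[] = items
        .filter(
          (i) => i.type === 'function_call' && !!i.call_id && !answered.has(i.call_id),
        )
        .map((i) => ({
          type: 'function_call_output',
          call_id: i.call_id,
          output:
            `Tool call '${i.name ?? ''}' was interrupted by a durable resume ` +
            'and did not complete. Please retry if still needed.',
        }));
      return [...items, ...synthetic];
    }

The real helpers go further — each synthetic output is inserted directly
after its orphan call and repeated call_ids are deduped; the sketch shows
only the orphan detection and injection.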
Co-authored-by: Isaac --- .../agent_server/agent.py | 25 ++--- .../agent_server/utils.py | 3 + .../client/src/components/chat.tsx | 96 +++---------------- .../client/src/components/messages.tsx | 27 +----- 4 files changed, 26 insertions(+), 125 deletions(-) diff --git a/agent-langgraph-advanced/agent_server/agent.py b/agent-langgraph-advanced/agent_server/agent.py index a39d3e72..91714656 100644 --- a/agent-langgraph-advanced/agent_server/agent.py +++ b/agent-langgraph-advanced/agent_server/agent.py @@ -4,7 +4,7 @@ import mlflow from databricks.sdk import WorkspaceClient -from databricks_langchain import ChatDatabricks, build_tool_resume_repair +from databricks_langchain import ChatDatabricks, build_tool_resume_repair_middleware from fastapi import HTTPException from langchain.agents import create_agent from langchain_core.messages import AnyMessage @@ -78,6 +78,11 @@ async def init_agent( checkpointer=checkpointer, store=store, state_schema=StatefulAgentState, + # Durable-resume repair: a kill mid-tool leaves an AIMessage with + # tool_calls whose ToolMessage responses never landed. This + # middleware injects synthetic ToolMessages before every model + # call; no-op on the happy path. + middleware=[build_tool_resume_repair_middleware()], ) @@ -120,28 +125,14 @@ async def stream_handler( # For on-behalf-of user authentication, pass get_user_workspace_client() to init_agent. agent = await init_agent(store=store, checkpointer=checkpointer) - # Durable-resume repair: a kill mid-tool leaves an AIMessage with - # tool_calls whose ToolMessage responses never landed. Inject - # synthetic ToolMessages (as_node="tools" so the graph transitions - # tools → model next, not re-evaluating the model→{tools,END} - # conditional branch which doesn't know "model" as a destination). - # No-op when state is clean. - state = await agent.aget_state(config) - has_history = bool(state and state.values.get("messages")) - if has_history: - repair = build_tool_resume_repair(state.values["messages"]) - if repair: - await agent.aupdate_state( - config, {"messages": repair}, as_node="tools" - ) - # If the thread has history in the checkpointer, only forward the # latest user turn — prior turns already live in state. Echoing the # full conversation (common from UI clients) can re-inject orphan # tool_uses left over in the client's buffer from a previously # interrupted attempt, tripping Anthropic's tool_use → tool_result # pairing check on the next LLM call. - if has_history and input_messages: + state = await agent.aget_state(config) + if state and state.values.get("messages") and input_messages: last_user = next( (m for m in reversed(input_messages) if m.get("role") == "user"), None, diff --git a/agent-langgraph-advanced/agent_server/utils.py b/agent-langgraph-advanced/agent_server/utils.py index 75b92de2..0798ae48 100644 --- a/agent-langgraph-advanced/agent_server/utils.py +++ b/agent-langgraph-advanced/agent_server/utils.py @@ -210,6 +210,9 @@ def _end_turn(): elif event[0] == "updates": for node_data in event[1].values(): + # Middleware nodes that no-op return None; skip those cleanly. 
+ if not node_data: + continue messages = node_data.get("messages", []) if not messages: continue diff --git a/e2e-chatbot-app-next/client/src/components/chat.tsx b/e2e-chatbot-app-next/client/src/components/chat.tsx index a9a9b38e..ed2663f1 100644 --- a/e2e-chatbot-app-next/client/src/components/chat.tsx +++ b/e2e-chatbot-app-next/client/src/components/chat.tsx @@ -69,20 +69,6 @@ export function Chat({ const resumeAttemptCountRef = useRef(0); const maxResumeAttempts = 3; - // Durable-resume snapshot: on data-resumed we record the length of the - // assistant message's text at that instant. Used two ways: - // (1) at RENDER time, Messages slices the first N chars off the text - // parts of the active assistant message so mid-stream the UI shows - // only attempt-2 content (state stays as AI SDK writes it). - // (2) onFinish runs a final setMessages truncate so the persisted - // message also reflects only attempt 2 (would otherwise be saved - // with both attempts concatenated). - // Needs to be state (not ref) so the map-time slice in Messages re-runs - // when the snapshot changes. Keyed by message id for multi-turn safety. - const [attempt1TextLen, setAttempt1TextLen] = useState< - Record - >({}); - const abortController = useRef(new AbortController()); useEffect(() => { return () => { @@ -196,40 +182,21 @@ export function Chat({ fetchChatHistory(); } // Durable-resume visual reset: when the backend's LongRunningAgentServer - // emits response.resumed (mid-stream pod crash + reclaim), the chat route - // writes a data-resumed part to signal us. Drop any text parts we've - // accumulated from the interrupted attempt so only the new attempt's - // text renders in-place. Tool parts are kept because they naturally - // de-dup across attempts via call_id. + // emits response.resumed (mid-stream pod crash + reclaim), the chat + // route forwards a data-resumed part. Drop text parts from the last + // assistant message so only attempt 2's content renders. Tool parts + // are kept untouched — they dedupe across attempts by call_id. if (dataPart.type === 'data-resumed') { - // Snapshot the current text length across text parts of the last - // assistant message. Messages uses this to slice at render time. - // Mid-stream state mutation is fighting the AI SDK accumulator - // (replaceMessage structuredClones the message on every write()), - // so we transform at the view layer instead. - const lastAssistantId = (() => { - // Peek into current messages via functional setter without mutating. - let captured: { id: string; len: number } | null = null; - setMessages((prev) => { - if (prev.length) { - const last = prev[prev.length - 1]; - if (last.role === 'assistant') { - const currentLen = (last.parts ?? []).reduce( - (acc: number, p: { type?: string; text?: string }) => - p.type === 'text' ? acc + (p.text?.length ?? 0) : acc, - 0, - ); - captured = { id: last.id, len: currentLen }; - } - } - return prev; - }); - return captured; - })(); - if (lastAssistantId) { - const { id, len } = lastAssistantId as { id: string; len: number }; - setAttempt1TextLen((prev) => ({ ...prev, [id]: len })); - } + setMessages((prev) => { + if (!prev.length) return prev; + const last = prev[prev.length - 1]; + if (last.role !== 'assistant') return prev; + const keptParts = (last.parts ?? []).filter( + (p: { type?: string }) => p.type !== 'text', + ); + if (keptParts.length === (last.parts ?? 
[]).length) return prev; + return [...prev.slice(0, -1), { ...last, parts: keptParts }]; + }); } }, onFinish: ({ @@ -241,40 +208,6 @@ export function Chat({ didFetchHistoryOnNewChat.current = false; setTitlePending(false); - // Post-stream durable-resume truncation. Persists attempt-2-only text - // into useChat's messages state (what gets saved to DB). The render- - // time slice in Messages handles mid-stream; this handles the final - // committed state. - const lastAssistant = finishedMessages?.at(-1); - if (lastAssistant && lastAssistant.role === 'assistant') { - const drop = attempt1TextLen[lastAssistant.id]; - if (drop && drop > 0) { - setMessages((prev) => { - if (!prev.length) return prev; - const last = prev[prev.length - 1]; - if (last.id !== lastAssistant.id) return prev; - let remaining = drop; - const newParts = (last.parts ?? []).map((p) => { - const tp = p as { type?: string; text?: string }; - if (tp.type !== 'text' || !tp.text) return p; - if (remaining <= 0) return p; - const cut = Math.min(remaining, tp.text.length); - const nextText = tp.text.slice(cut); - remaining -= cut; - return { ...tp, text: nextText }; - }); - const newLast = { ...last, parts: newParts } as ChatMessage; - return [...prev.slice(0, -1), newLast]; - }); - // Clear so the render-time slice in Messages stops kicking in - // (state is now already truncated). - setAttempt1TextLen((prev) => { - const { [lastAssistant.id]: _omit, ...rest } = prev; - return rest; - }); - } - } - // If user aborted, don't try to resume if (isAbort) { console.log('[Chat onFinish] Stream was aborted by user, not resuming'); @@ -411,7 +344,6 @@ export function Chat({ isReadonly={isReadonly} selectedModelId={initialChatModel} feedback={feedback} - attempt1TextLen={attempt1TextLen} /> diff --git a/e2e-chatbot-app-next/client/src/components/messages.tsx b/e2e-chatbot-app-next/client/src/components/messages.tsx index aa6a5f9e..4c125d88 100644 --- a/e2e-chatbot-app-next/client/src/components/messages.tsx +++ b/e2e-chatbot-app-next/client/src/components/messages.tsx @@ -18,12 +18,6 @@ interface MessagesProps { isReadonly: boolean; selectedModelId: string; feedback?: FeedbackMap; - // Durable-resume: messageId -> chars of text that belonged to attempt 1. - // When present, slice that many chars off the front of the message's text - // parts at render time so only attempt-2 content shows. State itself is - // left alone because the AI SDK accumulator keeps restoring the full - // text via structuredClone on every write(). - attempt1TextLen?: Record; } function PureMessages({ @@ -36,7 +30,6 @@ function PureMessages({ isReadonly, selectedModelId, feedback = {}, - attempt1TextLen, }: MessagesProps) { const { containerRef: messagesContainerRef, @@ -73,28 +66,10 @@ function PureMessages({ {messages.map((message, index) => { - // Render-time durable-resume slice: if this message had a resume - // boundary recorded, remove the leading attempt-1 chars from its - // text parts before passing to PreviewMessage. Creates a new - // message object so the memo sees a reference change and - // re-renders. Tool / step / data parts are passed through. - let displayMessage = message; - const drop = attempt1TextLen?.[message.id]; - if (drop && drop > 0) { - let remaining = drop; - const newParts = (message.parts ?? 
[]).map((p) => { - const tp = p as { type?: string; text?: string }; - if (tp.type !== 'text' || !tp.text || remaining <= 0) return p; - const cut = Math.min(remaining, tp.text.length); - remaining -= cut; - return { ...tp, text: tp.text.slice(cut) }; - }); - displayMessage = { ...message, parts: newParts } as ChatMessage; - } return ( Date: Tue, 21 Apr 2026 21:22:34 +0000 Subject: [PATCH 23/47] debug: log response.resumed detection in chat.ts onChunk --- e2e-chatbot-app-next/server/src/routes/chat.ts | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/e2e-chatbot-app-next/server/src/routes/chat.ts b/e2e-chatbot-app-next/server/src/routes/chat.ts index 8f4ee01a..d0b7be82 100644 --- a/e2e-chatbot-app-next/server/src/routes/chat.ts +++ b/e2e-chatbot-app-next/server/src/routes/chat.ts @@ -295,6 +295,11 @@ chatRouter.post('/', requireAuth, async (req: Request, res: Response) => { // boundary after a crash + CAS claim. Forward it once to the // client as a data-resumed part so the UI can drop the // interrupted attempt's text parts (tools keep their cards). + if (raw?.type === 'response.resumed') { + console.log( + `[chat][onChunk] saw response.resumed attempt=${raw?.attempt} writer_ready=${!!writerRef.current}`, + ); + } if (raw?.type === 'response.resumed' && writerRef.current) { const attempt = typeof raw?.attempt === 'number' ? raw.attempt : 2; if (!emittedResumedAttempts.has(attempt)) { @@ -304,6 +309,9 @@ chatRouter.post('/', requireAuth, async (req: Request, res: Response) => { type: 'data-resumed', data: { attempt }, }); + console.log( + `[chat][onChunk] forwarded data-resumed attempt=${attempt}`, + ); } catch (e) { console.warn('[chat] failed to forward data-resumed:', e); } From 337c39f12d7f26687c4c6d6399d8d75b70833ca2 Mon Sep 17 00:00:00 2001 From: Dhruv Gupta Date: Tue, 21 Apr 2026 21:30:59 +0000 Subject: [PATCH 24/47] debug: log every dataPart in chat.tsx onData to diagnose UI drop --- e2e-chatbot-app-next/client/src/components/chat.tsx | 1 + 1 file changed, 1 insertion(+) diff --git a/e2e-chatbot-app-next/client/src/components/chat.tsx b/e2e-chatbot-app-next/client/src/components/chat.tsx index ed2663f1..112fac3b 100644 --- a/e2e-chatbot-app-next/client/src/components/chat.tsx +++ b/e2e-chatbot-app-next/client/src/components/chat.tsx @@ -170,6 +170,7 @@ export function Chat({ }, }), onData: (dataPart) => { + console.log(`[chat][onData] dataPart.type=${dataPart.type}`, dataPart); setDataStream((ds) => ds ? [...ds, dataPart as DataUIPart] : [], ); From d144ec08f70a905cda27163ae8b84e20c465d75c Mon Sep 17 00:00:00 2001 From: Dhruv Gupta Date: Tue, 21 Apr 2026 21:36:54 +0000 Subject: [PATCH 25/47] debug: log setMessages wipe details on data-resumed --- .../client/src/components/chat.tsx | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/e2e-chatbot-app-next/client/src/components/chat.tsx b/e2e-chatbot-app-next/client/src/components/chat.tsx index 112fac3b..779b5893 100644 --- a/e2e-chatbot-app-next/client/src/components/chat.tsx +++ b/e2e-chatbot-app-next/client/src/components/chat.tsx @@ -189,13 +189,23 @@ export function Chat({ // are kept untouched — they dedupe across attempts by call_id. if (dataPart.type === 'data-resumed') { setMessages((prev) => { - if (!prev.length) return prev; + if (!prev.length) { + console.log('[chat][resumed] no messages, noop'); + return prev; + } const last = prev[prev.length - 1]; - if (last.role !== 'assistant') return prev; - const keptParts = (last.parts ?? 
[]).filter( + if (last.role !== 'assistant') { + console.log('[chat][resumed] last msg not assistant, noop', last.role); + return prev; + } + const origParts = last.parts ?? []; + const keptParts = origParts.filter( (p: { type?: string }) => p.type !== 'text', ); - if (keptParts.length === (last.parts ?? []).length) return prev; + console.log( + `[chat][resumed] msg=${last.id} parts_before=${origParts.length} parts_after=${keptParts.length} types_before=${JSON.stringify(origParts.map((p: any) => p.type))}`, + ); + if (keptParts.length === origParts.length) return prev; return [...prev.slice(0, -1), { ...last, parts: keptParts }]; }); } From 467d2ed7dd1537d34d69b3b319984d3a0221d9a1 Mon Sep 17 00:00:00 2001 From: Dhruv Gupta Date: Tue, 21 Apr 2026 21:47:34 +0000 Subject: [PATCH 26/47] UI: render-time slice for durable-resume text wipe MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit setMessages can't wipe mid-stream — the AI SDK's activeResponse.state is a snapshot taken at makeRequest time, and every text-delta calls write() → this.state.replaceMessage(lastIdx, activeResponse.state.message), which overwrites any setMessages we do. Our wipe was visible for a single chunk then reverted. Fix: snapshot the assistant message's parts.length at data-resumed, and at render time hide text parts at indices BEFORE that cutoff. Tool / step parts render normally at every index. Works for openai and langgraph because it transforms at the view layer rather than fighting the AI SDK state machine. Removes server-side debug log. Keeps the minimal delete-old-text UX. Co-authored-by: Isaac --- .../client/src/components/chat.tsx | 43 ++++++++++--------- .../client/src/components/messages.tsx | 21 ++++++++- .../server/src/routes/chat.ts | 8 ---- 3 files changed, 42 insertions(+), 30 deletions(-) diff --git a/e2e-chatbot-app-next/client/src/components/chat.tsx b/e2e-chatbot-app-next/client/src/components/chat.tsx index 779b5893..69c2e6ed 100644 --- a/e2e-chatbot-app-next/client/src/components/chat.tsx +++ b/e2e-chatbot-app-next/client/src/components/chat.tsx @@ -69,6 +69,13 @@ export function Chat({ const resumeAttemptCountRef = useRef(0); const maxResumeAttempts = 3; + // Durable-resume render-time slice: messageId → index in parts[] to cut at. + // Text parts BEFORE this index (attempt-1 text) are hidden at render time. + // Tool / step parts pass through regardless so they keep showing. + const [resumeCutIndex, setResumeCutIndex] = useState>( + {}, + ); + const abortController = useRef(new AbortController()); useEffect(() => { return () => { @@ -170,7 +177,6 @@ export function Chat({ }, }), onData: (dataPart) => { - console.log(`[chat][onData] dataPart.type=${dataPart.type}`, dataPart); setDataStream((ds) => ds ? [...ds, dataPart as DataUIPart] : [], ); @@ -183,30 +189,24 @@ export function Chat({ fetchChatHistory(); } // Durable-resume visual reset: when the backend's LongRunningAgentServer - // emits response.resumed (mid-stream pod crash + reclaim), the chat - // route forwards a data-resumed part. Drop text parts from the last - // assistant message so only attempt 2's content renders. Tool parts - // are kept untouched — they dedupe across attempts by call_id. + // emits response.resumed, snapshot the length of the last assistant + // message's parts array. Messages renders each message through a + // render-time slice that HIDES text parts at indices before this + // cutoff (attempt-1 text). Tool parts pass through at any index so + // they keep showing. 
setMessages can't wipe mid-stream because the + // AI SDK's activeResponse.state.message (snapshot taken at request + // start) overwrites it on the next chunk via write() → + // state.replaceMessage; render-time transform sidesteps that. if (dataPart.type === 'data-resumed') { setMessages((prev) => { - if (!prev.length) { - console.log('[chat][resumed] no messages, noop'); - return prev; - } const last = prev[prev.length - 1]; - if (last.role !== 'assistant') { - console.log('[chat][resumed] last msg not assistant, noop', last.role); - return prev; + if (last?.role === 'assistant') { + setResumeCutIndex((s) => ({ + ...s, + [last.id]: (last.parts ?? []).length, + })); } - const origParts = last.parts ?? []; - const keptParts = origParts.filter( - (p: { type?: string }) => p.type !== 'text', - ); - console.log( - `[chat][resumed] msg=${last.id} parts_before=${origParts.length} parts_after=${keptParts.length} types_before=${JSON.stringify(origParts.map((p: any) => p.type))}`, - ); - if (keptParts.length === origParts.length) return prev; - return [...prev.slice(0, -1), { ...last, parts: keptParts }]; + return prev; }); } }, @@ -355,6 +355,7 @@ export function Chat({ isReadonly={isReadonly} selectedModelId={initialChatModel} feedback={feedback} + resumeCutIndex={resumeCutIndex} /> diff --git a/e2e-chatbot-app-next/client/src/components/messages.tsx b/e2e-chatbot-app-next/client/src/components/messages.tsx index 4c125d88..a0c3748a 100644 --- a/e2e-chatbot-app-next/client/src/components/messages.tsx +++ b/e2e-chatbot-app-next/client/src/components/messages.tsx @@ -18,6 +18,10 @@ interface MessagesProps { isReadonly: boolean; selectedModelId: string; feedback?: FeedbackMap; + // Durable-resume render-time slice: messageId → parts[] index. Text parts + // at indices BEFORE this value are hidden (attempt-1 text); everything else + // renders normally. Tool / step parts are never hidden. + resumeCutIndex?: Record; } function PureMessages({ @@ -30,6 +34,7 @@ function PureMessages({ isReadonly, selectedModelId, feedback = {}, + resumeCutIndex, }: MessagesProps) { const { containerRef: messagesContainerRef, @@ -66,10 +71,24 @@ function PureMessages({ {messages.map((message, index) => { + // Render-time slice: if this message saw a durable-resume boundary, + // hide text parts at indices before the cutoff so only attempt-2 + // text shows. Tool / step parts are kept at every index. + const cut = resumeCutIndex?.[message.id]; + const displayMessage = + cut != null && cut > 0 + ? ({ + ...message, + parts: (message.parts ?? []).filter( + (p: { type?: string }, i: number) => + (p.type !== 'text' || i >= cut), + ), + } as ChatMessage) + : message; return ( { // boundary after a crash + CAS claim. Forward it once to the // client as a data-resumed part so the UI can drop the // interrupted attempt's text parts (tools keep their cards). - if (raw?.type === 'response.resumed') { - console.log( - `[chat][onChunk] saw response.resumed attempt=${raw?.attempt} writer_ready=${!!writerRef.current}`, - ); - } if (raw?.type === 'response.resumed' && writerRef.current) { const attempt = typeof raw?.attempt === 'number' ? 
raw.attempt : 2; if (!emittedResumedAttempts.has(attempt)) { @@ -309,9 +304,6 @@ chatRouter.post('/', requireAuth, async (req: Request, res: Response) => { type: 'data-resumed', data: { attempt }, }); - console.log( - `[chat][onChunk] forwarded data-resumed attempt=${attempt}`, - ); } catch (e) { console.warn('[chat] failed to forward data-resumed:', e); } From 695dcfb9aec79a7563ab3a1047206365afa75513 Mon Sep 17 00:00:00 2001 From: Dhruv Gupta Date: Tue, 21 Apr 2026 23:03:20 +0000 Subject: [PATCH 27/47] Simplify /invocations proxy: drop interruption suffix + writeEvent helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Removed the "_(connection interrupted — reconnecting…)_" delta block. Render-time slice hides attempt-1 text on resume anyway, so the suffix was invisible past the 10s stale window and too subtle during it. - Extracted writeEvent(type, payload) helper; sealActiveMessage went from 45 → 22 lines, no behavior change. - Removed readActive() TS-widening helper (no longer needed without the suffix block). - Inlined onFirstResponseId helper into its single call site. Net: 92 lines removed, 36 added in this file. Co-authored-by: Isaac --- e2e-chatbot-app-next/server/src/index.ts | 128 +++++++---------------- 1 file changed, 36 insertions(+), 92 deletions(-) diff --git a/e2e-chatbot-app-next/server/src/index.ts b/e2e-chatbot-app-next/server/src/index.ts index 9081fec9..95ea7879 100644 --- a/e2e-chatbot-app-next/server/src/index.ts +++ b/e2e-chatbot-app-next/server/src/index.ts @@ -137,22 +137,15 @@ if (agentBackendUrl) { let responseId: string | null = null; let lastSeq = 0; let sawDone = false; - // Terminal-error flag: if the backend emits a task_failed error event - // (e.g. upstream LLM returned 502, task_timeout, permanent failure), - // exit the resume loop instead of hammering retrieve N more times. + // Terminal-error flag: task_failed / task_timeout from upstream. let sawTerminalError = false; - const onFirstResponseId = (rid: string) => { - console.log(`[/invocations] background started response_id=${rid}`); - }; // Safety cap so a permanently-broken backend can't loop forever. const MAX_RESUME_ATTEMPTS = 10; let resumeAttempt = 0; - // Tracks the in-progress assistant message item across the stream so we - // can emit synthetic closure events on resume. Without this, attempt 1's - // partial text bubble keeps receiving deltas from attempt 2's fresh - // generation — the client sees attempt-1-partial + attempt-2-full - // concatenated. Sealing the partial turns it into its own bubble. + // Tracks the in-progress assistant message item so we can emit + // synthetic closure events on resume. Without this, attempt 2's + // deltas append to attempt 1's text part in the AI SDK's state. type ActiveMessage = { itemId: string; outputIndex: number; @@ -161,53 +154,32 @@ if (agentBackendUrl) { }; let activeMessage: ActiveMessage | null = null; - // Emit the closure events that finalize activeMessage with the text - // we've accumulated so far plus a short 'resuming…' suffix. Tunes - // OpenAI Responses API semantics enough for the Vercel AI SDK to - // treat it as a completed message — the next output_item.added in - // attempt 2 then starts a fresh assistant bubble. 
- const sealActiveMessage = (suffix: string) => { + const writeEvent = (type: string, payload: Record) => { + res.write(`event: ${type}\ndata: ${JSON.stringify({ type, ...payload })}\n\n`); + }; + + // Emit content_part.done + output_item.done for the active message so + // the Vercel AI SDK finalizes its text part and starts a fresh one on + // attempt 2's next output_item.added. + const sealActiveMessage = () => { if (!activeMessage) return; - const finalText = activeMessage.text + suffix; - if (suffix) { - res.write( - `event: response.output_text.delta\ndata: ${JSON.stringify({ - type: 'response.output_text.delta', - item_id: activeMessage.itemId, - output_index: activeMessage.outputIndex, - content_index: activeMessage.contentIndex, - delta: suffix, - })}\n\n`, - ); - } - res.write( - `event: response.content_part.done\ndata: ${JSON.stringify({ - type: 'response.content_part.done', - item_id: activeMessage.itemId, - output_index: activeMessage.outputIndex, - content_index: activeMessage.contentIndex, - part: { - type: 'output_text', - text: finalText, - annotations: [], - }, - })}\n\n`, - ); - res.write( - `event: response.output_item.done\ndata: ${JSON.stringify({ - type: 'response.output_item.done', - output_index: activeMessage.outputIndex, - item: { - id: activeMessage.itemId, - type: 'message', - role: 'assistant', - status: 'completed', - content: [ - { type: 'output_text', text: finalText, annotations: [] }, - ], - }, - })}\n\n`, - ); + const { itemId, outputIndex, contentIndex, text } = activeMessage; + writeEvent('response.content_part.done', { + item_id: itemId, + output_index: outputIndex, + content_index: contentIndex, + part: { type: 'output_text', text, annotations: [] }, + }); + writeEvent('response.output_item.done', { + output_index: outputIndex, + item: { + id: itemId, + type: 'message', + role: 'assistant', + status: 'completed', + content: [{ type: 'output_text', text, annotations: [] }], + }, + }); activeMessage = null; }; @@ -263,7 +235,7 @@ if (agentBackendUrl) { : undefined); if (!responseId && typeof rid === 'string') { responseId = rid; - onFirstResponseId(responseId); + console.log(`[/invocations] background started response_id=${responseId}`); } if ( typeof parsed.sequence_number === 'number' && @@ -299,17 +271,11 @@ if (agentBackendUrl) { // closure. activeMessage = null; } - // On the resume sentinel, seal any active message BEFORE forwarding - // the sentinel itself. Subsequent attempt-2 events will naturally - // emit a fresh output_item.added with a new item_id — the client - // sees a clean second assistant bubble. The interruption suffix - // was already appended when the upstream first closed (see the - // auto-resume loop below); seal with no additional suffix. + // On the resume sentinel, seal the active message before + // forwarding the sentinel. Attempt 2 emits a fresh output_item.added + // with a new id, so the AI SDK starts a clean text part for it. if (eventType === 'response.resumed' && activeMessage) { - console.log( - `[/invocations] sealing interrupted message item=${activeMessage.itemId} text_len=${activeMessage.text.length}`, - ); - sealActiveMessage(''); + sealActiveMessage(); } // Detect terminal errors (task_failed, task_timeout, etc.) so we // don't burn MAX_RESUME_ATTEMPTS fetching a response that will @@ -347,34 +313,12 @@ if (agentBackendUrl) { } sawDone = await pumpStream(initial); - // Auto-resume loop: if upstream closed early (pod crash) and we know a - // response_id, reconnect via the retrieve endpoint using our cursor. 
+ // Auto-resume loop: if upstream closed early and we have a response_id, + // reconnect via the retrieve endpoint using our cursor. if (!sawDone && responseId) { console.log( `[/invocations] upstream closed without [DONE] response_id=${responseId} last_seq=${lastSeq}; entering auto-resume`, ); - // Surface the interruption to the user right away — otherwise they'd - // see the partial text sit frozen for ~10s until the stale threshold - // expires and the backend emits response.resumed. The seal-on-resume - // path below will also append text if sentinel arrives, but this - // first suffix makes the 'something is happening' signal immediate. - // Wrap in a helper call so TS widens the type back — async functions - // mutating closure variables aren't tracked through ``await`` boundaries. - const readActive = (): ActiveMessage | null => activeMessage; - const am = readActive(); - if (am) { - const suffix = '\n\n_(connection interrupted — reconnecting…)_'; - res.write( - `event: response.output_text.delta\ndata: ${JSON.stringify({ - type: 'response.output_text.delta', - item_id: am.itemId, - output_index: am.outputIndex, - content_index: am.contentIndex, - delta: suffix, - })}\n\n`, - ); - am.text += suffix; - } } while (!sawDone && !sawTerminalError && responseId && resumeAttempt < MAX_RESUME_ATTEMPTS) { resumeAttempt += 1; From 25202c75c9502cf7ad0d15b9dfebf561f410c241 Mon Sep 17 00:00:00 2001 From: Dhruv Gupta Date: Wed, 22 Apr 2026 22:20:58 +0000 Subject: [PATCH 28/47] Advanced templates: strip user-space durability code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Durability mechanics now live entirely in databricks-ai-bridge's LongRunningAgentServer (rotate conv_id on resume + full-history input sanitizer, see ai-bridge PR #416). Templates can drop the explicit repair surface: - agent-langgraph-advanced/agent.py: drop middleware=[build_tool_resume_repair_middleware()] from create_agent and the unused import. Also drop the stream_handler UI-echo dedupe block — the server sanitizer handles mid-history orphans end-to-end. - agent-openai-advanced/utils.py: drop await session.repair() from deduplicate_input. session.repair() stays available as a public method for callers who want destructive session cleanup. Net: agent.py / utils.py in both advanced templates have zero durability-specific lines. The contract becomes "use our checkpointer/ session classes with LongRunningAgentServer — durable resume + orphan repair is free." 
Co-authored-by: Isaac --- .../agent_server/agent.py | 52 +++++++++------ agent-openai-advanced/agent_server/agent.py | 65 ++++++++++++++++++- agent-openai-advanced/agent_server/utils.py | 8 --- 3 files changed, 95 insertions(+), 30 deletions(-) diff --git a/agent-langgraph-advanced/agent_server/agent.py b/agent-langgraph-advanced/agent_server/agent.py index 91714656..50acb677 100644 --- a/agent-langgraph-advanced/agent_server/agent.py +++ b/agent-langgraph-advanced/agent_server/agent.py @@ -1,10 +1,11 @@ +import asyncio import logging from datetime import datetime from typing import Any, AsyncGenerator, Optional, Sequence, TypedDict import mlflow from databricks.sdk import WorkspaceClient -from databricks_langchain import ChatDatabricks, build_tool_resume_repair_middleware +from databricks_langchain import ChatDatabricks from fastapi import HTTPException from langchain.agents import create_agent from langchain_core.messages import AnyMessage @@ -50,6 +51,34 @@ def get_current_time() -> str: return datetime.now().isoformat() +@tool +def get_weather(city: str) -> str: + """Return a short weather summary for the given city.""" + stubs = { + "new york": "72°F, partly cloudy, light wind", + "los angeles": "78°F, sunny, mild humidity", + "tokyo": "65°F, rain, chance of thunderstorms", + } + return stubs.get(city.lower(), f"70°F, clear skies (stub for {city})") + + +@tool +def get_stock_price(ticker: str) -> str: + """Return a simulated stock price for the given ticker symbol.""" + stubs = {"AAPL": "$187.42 (+1.2%)", "GOOGL": "$141.78 (-0.4%)"} + return stubs.get(ticker.upper(), f"$100.00 (stub for {ticker.upper()})") + + +@tool +async def deep_research(topic: str) -> str: + """Run an in-depth multi-source research on the given topic. Takes ~15 seconds.""" + await asyncio.sleep(15) + return ( + f"Research summary on '{topic}': key findings include " + "historical context, current consensus, and two leading counter-arguments." + ) + + class StatefulAgentState(TypedDict, total=False): messages: Annotated[Sequence[AnyMessage], add_messages] custom_inputs: dict[str, Any] @@ -61,7 +90,7 @@ async def init_agent( workspace_client: Optional[WorkspaceClient] = None, checkpointer: Optional[Any] = None, ): - tools = [get_current_time] + memory_tools() + tools = [get_current_time, get_weather, get_stock_price, deep_research] + memory_tools() # To use MCP server tools instead, uncomment the below lines: # mcp_client = init_mcp_client(workspace_client or sp_workspace_client) # try: @@ -78,11 +107,6 @@ async def init_agent( checkpointer=checkpointer, store=store, state_schema=StatefulAgentState, - # Durable-resume repair: a kill mid-tool leaves an AIMessage with - # tool_calls whose ToolMessage responses never landed. This - # middleware injects synthetic ToolMessages before every model - # call; no-op on the happy path. - middleware=[build_tool_resume_repair_middleware()], ) @@ -125,20 +149,6 @@ async def stream_handler( # For on-behalf-of user authentication, pass get_user_workspace_client() to init_agent. agent = await init_agent(store=store, checkpointer=checkpointer) - # If the thread has history in the checkpointer, only forward the - # latest user turn — prior turns already live in state. Echoing the - # full conversation (common from UI clients) can re-inject orphan - # tool_uses left over in the client's buffer from a previously - # interrupted attempt, tripping Anthropic's tool_use → tool_result - # pairing check on the next LLM call. 
- state = await agent.aget_state(config) - if state and state.values.get("messages") and input_messages: - last_user = next( - (m for m in reversed(input_messages) if m.get("role") == "user"), - None, - ) - input_messages = [last_user] if last_user else [] - input_state: dict[str, Any] = { "messages": input_messages, "custom_inputs": dict(request.custom_inputs or {}), diff --git a/agent-openai-advanced/agent_server/agent.py b/agent-openai-advanced/agent_server/agent.py index db2272f6..50a927d4 100644 --- a/agent-openai-advanced/agent_server/agent.py +++ b/agent-openai-advanced/agent_server/agent.py @@ -1,3 +1,4 @@ +import asyncio import logging from datetime import datetime from typing import AsyncGenerator @@ -42,6 +43,62 @@ def get_current_time() -> str: return datetime.now().isoformat() +@function_tool +def get_weather(city: str) -> str: + """Return a short weather summary for the given city.""" + # Deterministic stub so durable-resume tests can verify the same tool + # result is reused across attempts rather than regenerated. + stubs = { + "new york": "72°F, partly cloudy, light wind", + "los angeles": "78°F, sunny, mild humidity", + "tokyo": "65°F, rain, chance of thunderstorms", + "paris": "60°F, overcast, occasional drizzle", + "london": "55°F, foggy, light rain", + "sydney": "82°F, sunny, breezy", + } + return stubs.get(city.lower(), f"70°F, clear skies (stub for {city})") + + +@function_tool +def get_stock_price(ticker: str) -> str: + """Return a simulated stock price for the given ticker symbol.""" + stubs = { + "AAPL": "$187.42 (+1.2%)", + "GOOGL": "$141.78 (-0.4%)", + "MSFT": "$415.06 (+0.8%)", + "NVDA": "$885.91 (+2.7%)", + "TSLA": "$204.33 (-1.5%)", + } + return stubs.get(ticker.upper(), f"$100.00 (stub for {ticker.upper()})") + + +@function_tool +def search_best_restaurants(city: str) -> str: + """Find a short list of notable restaurants in the given city.""" + stubs = { + "paris": "Le Comptoir du Relais, Septime, Chez L'Ami Jean", + "tokyo": "Sukiyabashi Jiro, Narisawa, Den", + "new york": "Eleven Madison Park, Le Bernardin, Daniel", + } + return stubs.get( + city.lower(), f"Local favorites in {city}: Cafe One, The Bistro, Riverside Kitchen" + ) + + +@function_tool +async def deep_research(topic: str) -> str: + """Run an in-depth multi-source research on the given topic. Takes ~15 seconds.""" + # Deliberately slow so durable-resume tests have a window to kill the + # agent mid-run and prove that tool results committed before the crash + # are preserved in the OpenAI Session and not re-invoked on resume. + await asyncio.sleep(15) + return ( + f"Research summary on '{topic}': key findings include " + "historical context, current consensus, and two leading " + "counter-arguments. 
(stubbed 15s simulated research)" + ) + + async def init_mcp_server(workspace_client: WorkspaceClient): return McpServer( url=f"{get_databricks_host_from_env()}/api/2.0/mcp/functions/system/ai", @@ -55,7 +112,13 @@ def create_agent(mcp_servers: list[McpServer] | None = None) -> Agent: name="Agent", instructions="You are a helpful assistant.", model="databricks-gpt-5-2", - tools=[get_current_time], + tools=[ + get_current_time, + get_weather, + get_stock_price, + search_best_restaurants, + deep_research, + ], mcp_servers=mcp_servers or [], ) diff --git a/agent-openai-advanced/agent_server/utils.py b/agent-openai-advanced/agent_server/utils.py index aa9ae11f..59ff774d 100644 --- a/agent-openai-advanced/agent_server/utils.py +++ b/agent-openai-advanced/agent_server/utils.py @@ -190,16 +190,8 @@ async def deduplicate_input(request: ResponsesAgentRequest, session: AsyncDatabr that history persisted, passing everything through would duplicate messages. If the session already covers the prior turns, only the latest message is needed since the session will prepend the full history automatically. - - Also repairs the session (dedupes items and injects synthetic outputs for - orphan tool_calls left behind by a durable-resume interrupt) so the next - LLM request over this session is a valid conversation. """ - await session.repair() messages = [i.model_dump() for i in request.input] - # Empty input is a valid signal from the long-running server to resume an - # existing session without re-sending user content — the session already - # has the prior turns, so there is nothing to deduplicate. if not messages: return [] # Normalize assistant message content from string to structured list format. From a912d8d527b90b73e72b27ecff5529f02f873c41 Mon Sep 17 00:00:00 2001 From: Dhruv Gupta Date: Wed, 22 Apr 2026 23:41:30 +0000 Subject: [PATCH 29/47] debug: disable UI wipe on data-resumed for observing inheritance Temporarily short-circuit the resumeCutIndex write so attempt-1's text stays visible while attempt-2 streams over it. Lets us see how the server-side inheritance + synthetic-output prompt shape the LLM's mid-turn continuation behavior without the visual wipe hiding what attempt-2 actually emits. Re-enable by uncommenting the block; the rest of the wipe plumbing (state hook, Messages prop threading, render-time slice) is left in place so re-enabling is a 1-line flip. Co-authored-by: Isaac --- .../client/src/components/chat.tsx | 26 ++++++++++++------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/e2e-chatbot-app-next/client/src/components/chat.tsx b/e2e-chatbot-app-next/client/src/components/chat.tsx index 69c2e6ed..dbf7bbfb 100644 --- a/e2e-chatbot-app-next/client/src/components/chat.tsx +++ b/e2e-chatbot-app-next/client/src/components/chat.tsx @@ -198,16 +198,22 @@ export function Chat({ // start) overwrites it on the next chunk via write() → // state.replaceMessage; render-time transform sidesteps that. if (dataPart.type === 'data-resumed') { - setMessages((prev) => { - const last = prev[prev.length - 1]; - if (last?.role === 'assistant') { - setResumeCutIndex((s) => ({ - ...s, - [last.id]: (last.parts ?? []).length, - })); - } - return prev; - }); + // TEMP: UI refresh/wipe on resume is DISABLED for durability testing. + // Without this, attempt-1 text stays on screen while attempt-2 streams + // its (possibly different) text over it — useful for observing how the + // server's attempt-1 inheritance + synthetic-output prompt shape the + // LLM's mid-turn resume behavior. 
Re-enable by uncommenting below. + // + // setMessages((prev) => { + // const last = prev[prev.length - 1]; + // if (last?.role === 'assistant') { + // setResumeCutIndex((s) => ({ + // ...s, + // [last.id]: (last.parts ?? []).length, + // })); + // } + // return prev; + // }); } }, onFinish: ({ From 44215111203c9406f5551fd85d09887ed17dcdb1 Mon Sep 17 00:00:00 2001 From: Dhruv Gupta Date: Thu, 23 Apr 2026 00:07:54 +0000 Subject: [PATCH 30/47] Remove UI text-refresh plumbing now that server-side inheritance handles resume Server-side changes earlier in this branch (prior-attempt tool-event inheritance + partial-stream reassembly in databricks-ai-bridge) make the client-side "wipe attempt-1 text when resume fires" machinery unnecessary: attempt-2's LLM sees attempt-1's work as history and continues seamlessly instead of restarting. The wipe was also hiding the new continuation quality from the user. Turning the wipe off in UI testing confirmed the server-side story is sufficient. Delete the full stack: - packages/core/src/types.ts: drop `resumed` from CustomUIDataTypes. - server/src/routes/chat.ts: drop writerRef + emittedResumedAttempts + the onChunk raw-event branch that emitted data-resumed parts. Trace-extraction stays; only the resume-forwarding path is removed. - client/src/components/chat.tsx: drop resumeCutIndex state hook, the data-resumed onData handler (was already commented out), and the prop pass to . - client/src/components/messages.tsx: drop resumeCutIndex prop from MessagesProps + its destructuring + the render-time text-part slice. The server still emits `response.resumed` as a sentinel so the Express proxy's sealActiveMessage() call correctly closes attempt-1's open text part before attempt-2's fresh output_item.added creates a new one. The proxy no longer extracts it into a UI data part. Co-authored-by: Isaac --- .../client/src/components/chat.tsx | 37 +------------------ .../client/src/components/messages.tsx | 21 +---------- .../packages/core/src/types.ts | 3 -- .../server/src/routes/chat.ts | 31 ---------------- 4 files changed, 2 insertions(+), 90 deletions(-) diff --git a/e2e-chatbot-app-next/client/src/components/chat.tsx b/e2e-chatbot-app-next/client/src/components/chat.tsx index dbf7bbfb..9e5fd79c 100644 --- a/e2e-chatbot-app-next/client/src/components/chat.tsx +++ b/e2e-chatbot-app-next/client/src/components/chat.tsx @@ -69,13 +69,6 @@ export function Chat({ const resumeAttemptCountRef = useRef(0); const maxResumeAttempts = 3; - // Durable-resume render-time slice: messageId → index in parts[] to cut at. - // Text parts BEFORE this index (attempt-1 text) are hidden at render time. - // Tool / step parts pass through regardless so they keep showing. - const [resumeCutIndex, setResumeCutIndex] = useState>( - {}, - ); - const abortController = useRef(new AbortController()); useEffect(() => { return () => { @@ -188,33 +181,6 @@ export function Chat({ setTitlePending(false); fetchChatHistory(); } - // Durable-resume visual reset: when the backend's LongRunningAgentServer - // emits response.resumed, snapshot the length of the last assistant - // message's parts array. Messages renders each message through a - // render-time slice that HIDES text parts at indices before this - // cutoff (attempt-1 text). Tool parts pass through at any index so - // they keep showing. 
setMessages can't wipe mid-stream because the - // AI SDK's activeResponse.state.message (snapshot taken at request - // start) overwrites it on the next chunk via write() → - // state.replaceMessage; render-time transform sidesteps that. - if (dataPart.type === 'data-resumed') { - // TEMP: UI refresh/wipe on resume is DISABLED for durability testing. - // Without this, attempt-1 text stays on screen while attempt-2 streams - // its (possibly different) text over it — useful for observing how the - // server's attempt-1 inheritance + synthetic-output prompt shape the - // LLM's mid-turn resume behavior. Re-enable by uncommenting below. - // - // setMessages((prev) => { - // const last = prev[prev.length - 1]; - // if (last?.role === 'assistant') { - // setResumeCutIndex((s) => ({ - // ...s, - // [last.id]: (last.parts ?? []).length, - // })); - // } - // return prev; - // }); - } }, onFinish: ({ isAbort, @@ -336,7 +302,7 @@ export function Chat({ return (
-
+
{inputElement} @@ -361,7 +327,6 @@ export function Chat({ isReadonly={isReadonly} selectedModelId={initialChatModel} feedback={feedback} - resumeCutIndex={resumeCutIndex} /> diff --git a/e2e-chatbot-app-next/client/src/components/messages.tsx b/e2e-chatbot-app-next/client/src/components/messages.tsx index a0c3748a..4c125d88 100644 --- a/e2e-chatbot-app-next/client/src/components/messages.tsx +++ b/e2e-chatbot-app-next/client/src/components/messages.tsx @@ -18,10 +18,6 @@ interface MessagesProps { isReadonly: boolean; selectedModelId: string; feedback?: FeedbackMap; - // Durable-resume render-time slice: messageId → parts[] index. Text parts - // at indices BEFORE this value are hidden (attempt-1 text); everything else - // renders normally. Tool / step parts are never hidden. - resumeCutIndex?: Record; } function PureMessages({ @@ -34,7 +30,6 @@ function PureMessages({ isReadonly, selectedModelId, feedback = {}, - resumeCutIndex, }: MessagesProps) { const { containerRef: messagesContainerRef, @@ -71,24 +66,10 @@ function PureMessages({ {messages.map((message, index) => { - // Render-time slice: if this message saw a durable-resume boundary, - // hide text parts at indices before the cutoff so only attempt-2 - // text shows. Tool / step parts are kept at every index. - const cut = resumeCutIndex?.[message.id]; - const displayMessage = - cut != null && cut > 0 - ? ({ - ...message, - parts: (message.parts ?? []).filter( - (p: { type?: string }, i: number) => - (p.type !== 'text' || i >= cut), - ), - } as ChatMessage) - : message; return ( ; diff --git a/e2e-chatbot-app-next/server/src/routes/chat.ts b/e2e-chatbot-app-next/server/src/routes/chat.ts index 8f4ee01a..e2f4080e 100644 --- a/e2e-chatbot-app-next/server/src/routes/chat.ts +++ b/e2e-chatbot-app-next/server/src/routes/chat.ts @@ -258,16 +258,6 @@ chatRouter.post('/', requireAuth, async (req: Request, res: Response) => { : {}), }; - // Track whether we've seen a durable-resume boundary so we can forward - // exactly one data-resumed event to the client (which uses it to wipe - // the interrupted attempt's text parts). writerRef is populated by the - // execute() callback below — onChunk runs inside the same stream and - // needs live access to the writer to push a data part mid-stream. - const writerRef: { current: { write: (part: unknown) => void } | null } = { - current: null, - }; - const emittedResumedAttempts = new Set(); - const result = streamText({ model, messages: modelMessages, @@ -291,24 +281,6 @@ chatRouter.post('/', requireAuth, async (req: Request, res: Response) => { if (!traceId && typeof raw?.trace_id === 'string') { traceId = raw.trace_id; } - // LongRunningAgentServer emits this at the attempt-1 → attempt-2 - // boundary after a crash + CAS claim. Forward it once to the - // client as a data-resumed part so the UI can drop the - // interrupted attempt's text parts (tools keep their cards). - if (raw?.type === 'response.resumed' && writerRef.current) { - const attempt = typeof raw?.attempt === 'number' ? raw.attempt : 2; - if (!emittedResumedAttempts.has(attempt)) { - emittedResumedAttempts.add(attempt); - try { - writerRef.current.write({ - type: 'data-resumed', - data: { attempt }, - }); - } catch (e) { - console.warn('[chat] failed to forward data-resumed:', e); - } - } - } } }, onFinish: ({ usage }) => { @@ -331,9 +303,6 @@ chatRouter.post('/', requireAuth, async (req: Request, res: Response) => { // rather than the AI SDK's default short-id format (e.g. "Xt8nZiQRj1fS4yiU"). 
generateId: generateUUID, execute: async ({ writer }) => { - // Expose writer to onChunk so it can forward data-resumed events - // the instant a durable-resume boundary is observed. - writerRef.current = writer as unknown as typeof writerRef.current; // Manually drain the AI stream so we can append the traceId data part // after all model chunks are processed (traceId is captured via onChunk). // result.toUIMessageStream() converts TextStreamPart → UIMessageChunk: From 7683079af7b61529c265ffb71b9ff3a1fcbec799 Mon Sep 17 00:00:00 2001 From: Dhruv Gupta Date: Thu, 23 Apr 2026 00:21:25 +0000 Subject: [PATCH 31/47] Strip app-templates PR to the bare minimum durability surface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove everything that isn't strictly required for durable resume with the server-side-only approach in ai-bridge PR #416: - agent-langgraph-advanced/agent_server/agent.py: revert entirely. The test-scaffolding tools (get_weather, get_stock_price, deep_research) were only for crash-test harnesses; the asyncio import only existed to support them. User-space durability surface for this template is now zero lines. - agent-openai-advanced/agent_server/agent.py: revert entirely. Drop the test-scaffolding tools (get_weather, get_stock_price, search_best_restaurants, deep_research) and asyncio import. Same zero-user-space result. - agent-langgraph-advanced/agent_server/utils.py: revert. The "middleware nodes that no-op return None" guard was defensive against middleware we no longer install. - agent-openai-advanced/agent_server/utils.py: revert. The empty-input guard was defensive against the old input=[] resume replay that no longer happens — server always replays the original input. - e2e-chatbot-app-next/server/src/index.ts: drop the activeMessage / sealActiveMessage / writeEvent machinery. Was synthesizing closure events on response.resumed to seal attempt-1's text part for the UI wipe. UI wipe is gone; the AI SDK creates parts by item_id so attempt-2's fresh output_item.added naturally starts a new part and attempt-1's open part finalizes on stream end. - Plus the earlier UI cleanup (chat.tsx, messages.tsx, types.ts, routes/chat.ts) that removed the data-resumed / resumeCutIndex plumbing. Remaining essentials: - agent_server/start_server.py: log-level setup so [durable] logs surface in app logs. - scripts/start_app.py: API_PROXY / AGENT_BACKEND_URL wiring so the Node AI SDK routes streaming POSTs through the Express background-mode + auto-resume proxy. Clone-from-branch is marked TEMPORARY (revert when ai-bridge ships). - pyproject.toml: databricks-ai-bridge git source pointer (TEMPORARY). - e2e-chatbot-app-next/server/src/index.ts: background-mode rewrite + auto-resume proxy for the /invocations route. 
Co-authored-by: Isaac --- .../agent_server/agent.py | 41 ++-------- .../agent_server/utils.py | 3 - agent-openai-advanced/agent_server/agent.py | 65 +--------------- agent-openai-advanced/agent_server/utils.py | 2 - .../client/src/components/chat.tsx | 2 +- .../client/src/components/messages.tsx | 40 +++++----- e2e-chatbot-app-next/server/src/index.ts | 77 +------------------ 7 files changed, 28 insertions(+), 202 deletions(-) diff --git a/agent-langgraph-advanced/agent_server/agent.py b/agent-langgraph-advanced/agent_server/agent.py index 50acb677..98e121b9 100644 --- a/agent-langgraph-advanced/agent_server/agent.py +++ b/agent-langgraph-advanced/agent_server/agent.py @@ -1,4 +1,3 @@ -import asyncio import logging from datetime import datetime from typing import Any, AsyncGenerator, Optional, Sequence, TypedDict @@ -51,34 +50,6 @@ def get_current_time() -> str: return datetime.now().isoformat() -@tool -def get_weather(city: str) -> str: - """Return a short weather summary for the given city.""" - stubs = { - "new york": "72°F, partly cloudy, light wind", - "los angeles": "78°F, sunny, mild humidity", - "tokyo": "65°F, rain, chance of thunderstorms", - } - return stubs.get(city.lower(), f"70°F, clear skies (stub for {city})") - - -@tool -def get_stock_price(ticker: str) -> str: - """Return a simulated stock price for the given ticker symbol.""" - stubs = {"AAPL": "$187.42 (+1.2%)", "GOOGL": "$141.78 (-0.4%)"} - return stubs.get(ticker.upper(), f"$100.00 (stub for {ticker.upper()})") - - -@tool -async def deep_research(topic: str) -> str: - """Run an in-depth multi-source research on the given topic. Takes ~15 seconds.""" - await asyncio.sleep(15) - return ( - f"Research summary on '{topic}': key findings include " - "historical context, current consensus, and two leading counter-arguments." - ) - - class StatefulAgentState(TypedDict, total=False): messages: Annotated[Sequence[AnyMessage], add_messages] custom_inputs: dict[str, Any] @@ -90,7 +61,7 @@ async def init_agent( workspace_client: Optional[WorkspaceClient] = None, checkpointer: Optional[Any] = None, ): - tools = [get_current_time, get_weather, get_stock_price, deep_research] + memory_tools() + tools = [get_current_time] + memory_tools() # To use MCP server tools instead, uncomment the below lines: # mcp_client = init_mcp_client(workspace_client or sp_workspace_client) # try: @@ -139,7 +110,10 @@ async def stream_handler( if user_id: config["configurable"]["user_id"] = user_id - input_messages = to_chat_completions_input([i.model_dump() for i in request.input]) + input_state: dict[str, Any] = { + "messages": to_chat_completions_input([i.model_dump() for i in request.input]), + "custom_inputs": dict(request.custom_inputs or {}), + } try: async with lakebase_context(LAKEBASE_CONFIG) as (checkpointer, store): @@ -149,11 +123,6 @@ async def stream_handler( # For on-behalf-of user authentication, pass get_user_workspace_client() to init_agent. 
agent = await init_agent(store=store, checkpointer=checkpointer) - input_state: dict[str, Any] = { - "messages": input_messages, - "custom_inputs": dict(request.custom_inputs or {}), - } - async for event in process_agent_astream_events( agent.astream(input_state, config, stream_mode=["updates", "messages"]) ): diff --git a/agent-langgraph-advanced/agent_server/utils.py b/agent-langgraph-advanced/agent_server/utils.py index 0798ae48..75b92de2 100644 --- a/agent-langgraph-advanced/agent_server/utils.py +++ b/agent-langgraph-advanced/agent_server/utils.py @@ -210,9 +210,6 @@ def _end_turn(): elif event[0] == "updates": for node_data in event[1].values(): - # Middleware nodes that no-op return None; skip those cleanly. - if not node_data: - continue messages = node_data.get("messages", []) if not messages: continue diff --git a/agent-openai-advanced/agent_server/agent.py b/agent-openai-advanced/agent_server/agent.py index 50a927d4..db2272f6 100644 --- a/agent-openai-advanced/agent_server/agent.py +++ b/agent-openai-advanced/agent_server/agent.py @@ -1,4 +1,3 @@ -import asyncio import logging from datetime import datetime from typing import AsyncGenerator @@ -43,62 +42,6 @@ def get_current_time() -> str: return datetime.now().isoformat() -@function_tool -def get_weather(city: str) -> str: - """Return a short weather summary for the given city.""" - # Deterministic stub so durable-resume tests can verify the same tool - # result is reused across attempts rather than regenerated. - stubs = { - "new york": "72°F, partly cloudy, light wind", - "los angeles": "78°F, sunny, mild humidity", - "tokyo": "65°F, rain, chance of thunderstorms", - "paris": "60°F, overcast, occasional drizzle", - "london": "55°F, foggy, light rain", - "sydney": "82°F, sunny, breezy", - } - return stubs.get(city.lower(), f"70°F, clear skies (stub for {city})") - - -@function_tool -def get_stock_price(ticker: str) -> str: - """Return a simulated stock price for the given ticker symbol.""" - stubs = { - "AAPL": "$187.42 (+1.2%)", - "GOOGL": "$141.78 (-0.4%)", - "MSFT": "$415.06 (+0.8%)", - "NVDA": "$885.91 (+2.7%)", - "TSLA": "$204.33 (-1.5%)", - } - return stubs.get(ticker.upper(), f"$100.00 (stub for {ticker.upper()})") - - -@function_tool -def search_best_restaurants(city: str) -> str: - """Find a short list of notable restaurants in the given city.""" - stubs = { - "paris": "Le Comptoir du Relais, Septime, Chez L'Ami Jean", - "tokyo": "Sukiyabashi Jiro, Narisawa, Den", - "new york": "Eleven Madison Park, Le Bernardin, Daniel", - } - return stubs.get( - city.lower(), f"Local favorites in {city}: Cafe One, The Bistro, Riverside Kitchen" - ) - - -@function_tool -async def deep_research(topic: str) -> str: - """Run an in-depth multi-source research on the given topic. Takes ~15 seconds.""" - # Deliberately slow so durable-resume tests have a window to kill the - # agent mid-run and prove that tool results committed before the crash - # are preserved in the OpenAI Session and not re-invoked on resume. - await asyncio.sleep(15) - return ( - f"Research summary on '{topic}': key findings include " - "historical context, current consensus, and two leading " - "counter-arguments. 
(stubbed 15s simulated research)" - ) - - async def init_mcp_server(workspace_client: WorkspaceClient): return McpServer( url=f"{get_databricks_host_from_env()}/api/2.0/mcp/functions/system/ai", @@ -112,13 +55,7 @@ def create_agent(mcp_servers: list[McpServer] | None = None) -> Agent: name="Agent", instructions="You are a helpful assistant.", model="databricks-gpt-5-2", - tools=[ - get_current_time, - get_weather, - get_stock_price, - search_best_restaurants, - deep_research, - ], + tools=[get_current_time], mcp_servers=mcp_servers or [], ) diff --git a/agent-openai-advanced/agent_server/utils.py b/agent-openai-advanced/agent_server/utils.py index 59ff774d..bec8919c 100644 --- a/agent-openai-advanced/agent_server/utils.py +++ b/agent-openai-advanced/agent_server/utils.py @@ -192,8 +192,6 @@ async def deduplicate_input(request: ResponsesAgentRequest, session: AsyncDatabr since the session will prepend the full history automatically. """ messages = [i.model_dump() for i in request.input] - if not messages: - return [] # Normalize assistant message content from string to structured list format. # MLflow evaluation sends assistant content as a plain string, but the OpenAI # Agents SDK expects it as [{"type": "output_text", "text": ..., "annotations": []}]. diff --git a/e2e-chatbot-app-next/client/src/components/chat.tsx b/e2e-chatbot-app-next/client/src/components/chat.tsx index 9e5fd79c..735b894c 100644 --- a/e2e-chatbot-app-next/client/src/components/chat.tsx +++ b/e2e-chatbot-app-next/client/src/components/chat.tsx @@ -302,7 +302,7 @@ export function Chat({ return (
-
+
{inputElement} diff --git a/e2e-chatbot-app-next/client/src/components/messages.tsx b/e2e-chatbot-app-next/client/src/components/messages.tsx index 4c125d88..8b740452 100644 --- a/e2e-chatbot-app-next/client/src/components/messages.tsx +++ b/e2e-chatbot-app-next/client/src/components/messages.tsx @@ -65,27 +65,25 @@ function PureMessages({ > - {messages.map((message, index) => { - return ( - - ); - })} + {messages.map((message, index) => ( + + ))} {status === 'submitted' && messages.length > 0 && diff --git a/e2e-chatbot-app-next/server/src/index.ts b/e2e-chatbot-app-next/server/src/index.ts index 95ea7879..73803de4 100644 --- a/e2e-chatbot-app-next/server/src/index.ts +++ b/e2e-chatbot-app-next/server/src/index.ts @@ -143,48 +143,8 @@ if (agentBackendUrl) { const MAX_RESUME_ATTEMPTS = 10; let resumeAttempt = 0; - // Tracks the in-progress assistant message item so we can emit - // synthetic closure events on resume. Without this, attempt 2's - // deltas append to attempt 1's text part in the AI SDK's state. - type ActiveMessage = { - itemId: string; - outputIndex: number; - contentIndex: number; - text: string; - }; - let activeMessage: ActiveMessage | null = null; - - const writeEvent = (type: string, payload: Record) => { - res.write(`event: ${type}\ndata: ${JSON.stringify({ type, ...payload })}\n\n`); - }; - - // Emit content_part.done + output_item.done for the active message so - // the Vercel AI SDK finalizes its text part and starts a fresh one on - // attempt 2's next output_item.added. - const sealActiveMessage = () => { - if (!activeMessage) return; - const { itemId, outputIndex, contentIndex, text } = activeMessage; - writeEvent('response.content_part.done', { - item_id: itemId, - output_index: outputIndex, - content_index: contentIndex, - part: { type: 'output_text', text, annotations: [] }, - }); - writeEvent('response.output_item.done', { - output_index: outputIndex, - item: { - id: itemId, - type: 'message', - role: 'assistant', - status: 'completed', - content: [{ type: 'output_text', text, annotations: [] }], - }, - }); - activeMessage = null; - }; - - // Read one SSE stream, track metadata + in-progress items, optionally - // emit synthetic closure events, then forward each frame to the client. + // Read one SSE stream, extract response_id + sequence_number + detect + // terminal errors, forward each frame to the client. // Returns whether we saw the [DONE] sentinel. const pumpStream = async (upstream: globalThis.Response) => { if (!upstream.body) return false; @@ -244,39 +204,6 @@ if (agentBackendUrl) { lastSeq = parsed.sequence_number as number; } const eventType = parsed.type as string | undefined; - const item = (parsed.item as Record | undefined) ?? undefined; - // Update activeMessage state (pre-forward). - if ( - eventType === 'response.output_item.added' && - item?.type === 'message' - ) { - activeMessage = { - itemId: (item.id as string) || '', - outputIndex: (parsed.output_index as number) ?? 0, - contentIndex: 0, - text: '', - }; - } else if ( - eventType === 'response.output_text.delta' && - activeMessage && - (parsed.item_id as string) === activeMessage.itemId - ) { - activeMessage.text += (parsed.delta as string) ?? ''; - } else if ( - eventType === 'response.output_item.done' && - item?.type === 'message' && - activeMessage?.itemId === (item.id as string) - ) { - // Backend closed the message itself; we don't need our synthetic - // closure. 
- activeMessage = null; - } - // On the resume sentinel, seal the active message before - // forwarding the sentinel. Attempt 2 emits a fresh output_item.added - // with a new id, so the AI SDK starts a clean text part for it. - if (eventType === 'response.resumed' && activeMessage) { - sealActiveMessage(); - } // Detect terminal errors (task_failed, task_timeout, etc.) so we // don't burn MAX_RESUME_ATTEMPTS fetching a response that will // never succeed. Upstream LLM 502s and permanent run failures From 5f3c5076d6d779240d3674a71ef5e326404c71a9 Mon Sep 17 00:00:00 2001 From: Dhruv Gupta Date: Thu, 23 Apr 2026 05:16:10 +0000 Subject: [PATCH 32/47] chat: cap resume attempts per turn, don't reset on each chunk Infinite Stream Resume loop seen with Claude multi-tool turns via durable retrieve. Root: - useChat's onStreamPart reset resumeAttemptCountRef on every chunk, so the 3-retry cap was only enforced when a stream ended empty. When Claude's provider failed to emit a clean `finish` UIMessageChunk at the end of the stream, lastPart.type !== 'finish' kept streamIncomplete = true. Each resume replayed the cached stream, delivered chunks, reset the counter to 0, onFinish fired without `finish`, looped. Fix: - Remove the per-chunk reset in onStreamPart. - Reset only in prepareSendMessagesRequest when the last message is a user message (a genuine new turn). Tool-result continuations (non-user-message continuations) don't reset. - Cap stays at 3; after that, fetchChatHistory() pulls the DB-persisted state so the user sees the final assistant output instead of spinning forever. Co-authored-by: Isaac --- .../client/src/components/chat.tsx | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/e2e-chatbot-app-next/client/src/components/chat.tsx b/e2e-chatbot-app-next/client/src/components/chat.tsx index 735b894c..36be050e 100644 --- a/e2e-chatbot-app-next/client/src/components/chat.tsx +++ b/e2e-chatbot-app-next/client/src/components/chat.tsx @@ -65,7 +65,13 @@ export function Chat({ const lastPartRef = useRef(lastPart); lastPartRef.current = lastPart; - // Single counter for resume attempts - reset when stream parts are received + // Absolute cap on resume attempts per chat turn. Not reset on each chunk + // because some provider/stream combinations (e.g. Claude via openai-agents + // through the durable retrieve path) never emit a clean `finish` + // UIMessageChunk — which leaves `streamIncomplete` perpetually true and, + // combined with a cursor-less replay, caused a runaway retry loop when + // reset-on-chunk was in place. We reset only when a fresh user message + // kicks off a new turn (see sendMessage wrapping below). const resumeAttemptCountRef = useRef(0); const maxResumeAttempts = 3; @@ -124,8 +130,6 @@ export function Chat({ } didFetchHistoryOnNewChat.current = true; } - // Reset resume attempts when we successfully receive stream parts - resumeAttemptCountRef.current = 0; setLastPart(part); }, api: '/api/chat', @@ -133,6 +137,11 @@ export function Chat({ prepareSendMessagesRequest({ messages, id, body }) { const lastMessage = messages.at(-1); const isUserMessage = lastMessage?.role === 'user'; + // A fresh user message starts a new turn — reset the resume counter. + // Tool-result continuations (non-user messages) don't reset. 
+ if (isUserMessage) { + resumeAttemptCountRef.current = 0; + } // For continuations (non-user messages like tool results), we must always // send previousMessages because the tool result only exists client-side From 0ddbd602ce442faceffce1b622cb0385fccb5e6b Mon Sep 17 00:00:00 2001 From: Dhruv Gupta Date: Thu, 23 Apr 2026 05:42:39 +0000 Subject: [PATCH 33/47] =?UTF-8?q?Openai=20template:=20stable=20=E2=80=94?= =?UTF-8?q?=20per-type=20+=20per-call-id=20stream=20id=20tracking?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Final stable state for durable execution. End-to-end UI-validated scenarios that now work: - Multi-tool turn interrupted mid-sequence, durable resume inherits completed tool pairs + narrative (reordered) + synthetic output for the interrupted call, agent continues from where it left off. - Text-only mid-stream crash, partial-text reassembly + Claude prefill → continuation. - Cross-turn recall after crash-and-resume (stable thread via read- time checkpoint repair on LangGraph / session auto-repair on OpenAI). - Multi-tool on GPT-5 + openai-agents (single-response-per-turn). Template fix here: process_agent_stream_events now disambiguates by (a) item.type bucket for delta routing and (b) call_id bucket for multiple open function_calls. The original single curr_item_id bucket worked for GPT-5's strictly serial events but collided on Claude's interleaved + parallel tool-call events, which produced two items sharing one id and broke the client's part tracking. Pairs with databricks-ai-bridge PR #416 changes (rotate + replay + full-history sanitizer + prior-attempt tool-pair inheritance + narrative hoist + checkpoint read-time repair + session auto-repair). Co-authored-by: Isaac --- agent-openai-advanced/agent_server/utils.py | 72 ++++++++++++++++++--- 1 file changed, 64 insertions(+), 8 deletions(-) diff --git a/agent-openai-advanced/agent_server/utils.py b/agent-openai-advanced/agent_server/utils.py index bec8919c..9553369a 100644 --- a/agent-openai-advanced/agent_server/utils.py +++ b/agent-openai-advanced/agent_server/utils.py @@ -209,23 +209,79 @@ async def deduplicate_input(request: ResponsesAgentRequest, session: AsyncDatabr return messages +_DELTA_EVENT_TO_ITEM_TYPE = { + "response.output_text.delta": "message", + "response.content_part.added": "message", + "response.content_part.done": "message", + "response.function_call_arguments.delta": "function_call", + "response.function_call_arguments.done": "function_call", + "response.reasoning.delta": "reasoning", + "response.reasoning_summary_text.delta": "reasoning", +} + + async def process_agent_stream_events( async_stream: AsyncIterator[StreamEvent], response_id: str | None = None, ) -> AsyncGenerator[ResponsesAgentStreamEvent, None]: - curr_item_id = str(uuid4()) + # Runner streams every event with FAKE_RESPONSES_ID, so we assign uuids + # and reconstruct ownership. Two dimensions of disambiguation are + # required for providers that emit parallel / interleaved events + # (notably Claude via the Databricks OpenAI-compatible endpoint): + # + # 1. Items of different types interleave (function_call.added then + # message.added, then function_call.done), so we keep the current + # id bucket per item.type for delta routing. + # 2. Multiple function_calls can be open at once (Claude parallel tool + # calling). Each call_id must map to its own uuid so the .done + # frames land on the right item — otherwise the most recent call's + # uuid gets stamped on every function_call.done. 
+ current_id_by_type: dict[str, str] = {} + fc_id_by_call_id: dict[str, str] = {} + + def fc_id_for(call_id: str) -> str: + if call_id not in fc_id_by_call_id: + fc_id_by_call_id[call_id] = str(uuid4()) + return fc_id_by_call_id[call_id] + async for event in async_stream: if event.type == "raw_response_event": event_data = event.data.model_dump() if response_id is not None: event_data = replace_fake_id(event_data, response_id) - if event_data["type"] == "response.output_item.added": - curr_item_id = str(uuid4()) - event_data["item"]["id"] = curr_item_id - elif event_data.get("item") is not None and event_data["item"].get("id") is not None: - event_data["item"]["id"] = curr_item_id - elif event_data.get("item_id") is not None: - event_data["item_id"] = curr_item_id + + event_type = event_data.get("type", "") + item = event_data.get("item") if isinstance(event_data.get("item"), dict) else None + item_type = item.get("type") if item else None + call_id = item.get("call_id") if item else None + + if event_type == "response.output_item.added" and item_type: + if item_type == "function_call" and call_id: + new_id = fc_id_for(call_id) + else: + new_id = str(uuid4()) + current_id_by_type[item_type] = new_id + event_data["item"]["id"] = new_id + elif event_type == "response.output_item.done" and item_type: + if item_type == "function_call" and call_id: + # Look up by call_id — parallel tool calls each have + # their own registered uuid. + event_data["item"]["id"] = fc_id_for(call_id) + elif item_type in current_id_by_type: + event_data["item"]["id"] = current_id_by_type[item_type] + elif event_type in _DELTA_EVENT_TO_ITEM_TYPE: + # Delta frames carry only item_id, not call_id. Route to the + # most recently opened item of the owning type — this is + # what Runner's stream actually implies (the current args + # delta belongs to the current function_call). + owner_type = _DELTA_EVENT_TO_ITEM_TYPE[event_type] + owner_id = current_id_by_type.get(owner_type) + if owner_id: + if event_data.get("item_id") is not None: + event_data["item_id"] = owner_id + if item and item.get("id") is not None: + event_data["item"]["id"] = owner_id + yield event_data elif event.type == "run_item_stream_event" and event.item.type == "tool_call_output_item": yield ResponsesAgentStreamEvent( From 24140b3bfc0dac251c3ca5ce86fe4f4e58c7ed27 Mon Sep 17 00:00:00 2001 From: Dhruv Gupta Date: Thu, 23 Apr 2026 06:31:06 +0000 Subject: [PATCH 34/47] Revert template-side durable-resume hardening; bridge fix suffices MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit End-to-end UI test on Claude (via deployed agent-openai-advanced with the updated databricks-ai-bridge) confirmed that the bridge-side ordering fix (sanitizer + narrative hoist + tool-pair inheritance + session auto-repair) is sufficient on its own. The two template-side guards added in earlier commits are no longer needed: - Revert 0ddbd60: `process_agent_stream_events` per-type + per-call-id id tracking. The single-bucket implementation handles Claude's interleaved + parallel tool-call events correctly now that the upstream ordering is clean. - Revert 5f3c507: `chat.tsx` user-message-only resume-counter reset. Claude now emits a clean `finish` UIMessageChunk through the durable retrieve path, so the per-chunk reset no longer traps the 3-retry cap in an infinite loop. Keeps the advanced templates lean — durability logic lives entirely in databricks-ai-bridge (LongRunningAgentServer). 
Co-authored-by: Isaac --- agent-openai-advanced/agent_server/utils.py | 72 +++---------------- .../client/src/components/chat.tsx | 15 +--- 2 files changed, 11 insertions(+), 76 deletions(-) diff --git a/agent-openai-advanced/agent_server/utils.py b/agent-openai-advanced/agent_server/utils.py index 9553369a..bec8919c 100644 --- a/agent-openai-advanced/agent_server/utils.py +++ b/agent-openai-advanced/agent_server/utils.py @@ -209,79 +209,23 @@ async def deduplicate_input(request: ResponsesAgentRequest, session: AsyncDatabr return messages -_DELTA_EVENT_TO_ITEM_TYPE = { - "response.output_text.delta": "message", - "response.content_part.added": "message", - "response.content_part.done": "message", - "response.function_call_arguments.delta": "function_call", - "response.function_call_arguments.done": "function_call", - "response.reasoning.delta": "reasoning", - "response.reasoning_summary_text.delta": "reasoning", -} - - async def process_agent_stream_events( async_stream: AsyncIterator[StreamEvent], response_id: str | None = None, ) -> AsyncGenerator[ResponsesAgentStreamEvent, None]: - # Runner streams every event with FAKE_RESPONSES_ID, so we assign uuids - # and reconstruct ownership. Two dimensions of disambiguation are - # required for providers that emit parallel / interleaved events - # (notably Claude via the Databricks OpenAI-compatible endpoint): - # - # 1. Items of different types interleave (function_call.added then - # message.added, then function_call.done), so we keep the current - # id bucket per item.type for delta routing. - # 2. Multiple function_calls can be open at once (Claude parallel tool - # calling). Each call_id must map to its own uuid so the .done - # frames land on the right item — otherwise the most recent call's - # uuid gets stamped on every function_call.done. - current_id_by_type: dict[str, str] = {} - fc_id_by_call_id: dict[str, str] = {} - - def fc_id_for(call_id: str) -> str: - if call_id not in fc_id_by_call_id: - fc_id_by_call_id[call_id] = str(uuid4()) - return fc_id_by_call_id[call_id] - + curr_item_id = str(uuid4()) async for event in async_stream: if event.type == "raw_response_event": event_data = event.data.model_dump() if response_id is not None: event_data = replace_fake_id(event_data, response_id) - - event_type = event_data.get("type", "") - item = event_data.get("item") if isinstance(event_data.get("item"), dict) else None - item_type = item.get("type") if item else None - call_id = item.get("call_id") if item else None - - if event_type == "response.output_item.added" and item_type: - if item_type == "function_call" and call_id: - new_id = fc_id_for(call_id) - else: - new_id = str(uuid4()) - current_id_by_type[item_type] = new_id - event_data["item"]["id"] = new_id - elif event_type == "response.output_item.done" and item_type: - if item_type == "function_call" and call_id: - # Look up by call_id — parallel tool calls each have - # their own registered uuid. - event_data["item"]["id"] = fc_id_for(call_id) - elif item_type in current_id_by_type: - event_data["item"]["id"] = current_id_by_type[item_type] - elif event_type in _DELTA_EVENT_TO_ITEM_TYPE: - # Delta frames carry only item_id, not call_id. Route to the - # most recently opened item of the owning type — this is - # what Runner's stream actually implies (the current args - # delta belongs to the current function_call). 
- owner_type = _DELTA_EVENT_TO_ITEM_TYPE[event_type] - owner_id = current_id_by_type.get(owner_type) - if owner_id: - if event_data.get("item_id") is not None: - event_data["item_id"] = owner_id - if item and item.get("id") is not None: - event_data["item"]["id"] = owner_id - + if event_data["type"] == "response.output_item.added": + curr_item_id = str(uuid4()) + event_data["item"]["id"] = curr_item_id + elif event_data.get("item") is not None and event_data["item"].get("id") is not None: + event_data["item"]["id"] = curr_item_id + elif event_data.get("item_id") is not None: + event_data["item_id"] = curr_item_id yield event_data elif event.type == "run_item_stream_event" and event.item.type == "tool_call_output_item": yield ResponsesAgentStreamEvent( diff --git a/e2e-chatbot-app-next/client/src/components/chat.tsx b/e2e-chatbot-app-next/client/src/components/chat.tsx index 36be050e..735b894c 100644 --- a/e2e-chatbot-app-next/client/src/components/chat.tsx +++ b/e2e-chatbot-app-next/client/src/components/chat.tsx @@ -65,13 +65,7 @@ export function Chat({ const lastPartRef = useRef(lastPart); lastPartRef.current = lastPart; - // Absolute cap on resume attempts per chat turn. Not reset on each chunk - // because some provider/stream combinations (e.g. Claude via openai-agents - // through the durable retrieve path) never emit a clean `finish` - // UIMessageChunk — which leaves `streamIncomplete` perpetually true and, - // combined with a cursor-less replay, caused a runaway retry loop when - // reset-on-chunk was in place. We reset only when a fresh user message - // kicks off a new turn (see sendMessage wrapping below). + // Single counter for resume attempts - reset when stream parts are received const resumeAttemptCountRef = useRef(0); const maxResumeAttempts = 3; @@ -130,6 +124,8 @@ export function Chat({ } didFetchHistoryOnNewChat.current = true; } + // Reset resume attempts when we successfully receive stream parts + resumeAttemptCountRef.current = 0; setLastPart(part); }, api: '/api/chat', @@ -137,11 +133,6 @@ export function Chat({ prepareSendMessagesRequest({ messages, id, body }) { const lastMessage = messages.at(-1); const isUserMessage = lastMessage?.role === 'user'; - // A fresh user message starts a new turn — reset the resume counter. - // Tool-result continuations (non-user messages) don't reset. - if (isUserMessage) { - resumeAttemptCountRef.current = 0; - } // For continuations (non-user messages like tool results), we must always // send previousMessages because the tool result only exists client-side From db9fb45edf24507ec68a2020dd5fae6fdacdb443 Mon Sep 17 00:00:00 2001 From: Dhruv Gupta Date: Thu, 23 Apr 2026 06:31:27 +0000 Subject: [PATCH 35/47] Refactor /invocations pumpStream into SSE parsing helpers Extract three pure helpers above the route handler so the SSE frame loop reads like prose: - parseSseFrame(frame): classifies a frame as done / passthrough / data. - extractResponseId(payload): tolerates FastAPI's three response_id locations (response_id, response.id, top-level id with resp_ prefix). - isTerminalErrorFrame(payload): detects task_failed / task_timeout so the resume loop can short-circuit. pumpStream now just drives the reader + forwards bytes; the parsing logic is testable in isolation and the handler body is substantially shorter. 
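A sketch of the isolated tests this enables. Illustrative only: it assumes the three helpers are exported from server/src/index.ts and that a vitest-style runner is wired up, neither of which is part of this patch.

    // Hypothetical test sketch; not included in this commit.
    import { describe, expect, it } from 'vitest';
    // Assumes the helpers were exported purely for testing.
    import { parseSseFrame, extractResponseId, isTerminalErrorFrame } from './index';

    describe('SSE parsing helpers', () => {
      it('classifies frames as done / data / passthrough', () => {
        expect(parseSseFrame('data: [DONE]')).toEqual({ kind: 'done' });
        expect(parseSseFrame(': heartbeat')).toEqual({ kind: 'passthrough' });
        expect(parseSseFrame('data: {"type":"response.created"}').kind).toBe('data');
      });

      it('tolerates the three response_id locations', () => {
        expect(extractResponseId({ response_id: 'resp_a' })).toBe('resp_a');
        expect(extractResponseId({ response: { id: 'resp_b' } })).toBe('resp_b');
        expect(extractResponseId({ id: 'resp_c' })).toBe('resp_c');
        expect(extractResponseId({ id: 'chatcmpl-123' })).toBeNull();
      });

      it('treats only task_failed / task_timeout as terminal', () => {
        expect(isTerminalErrorFrame({ type: 'error', error: { code: 'task_failed' } })).toBe(true);
        expect(isTerminalErrorFrame({ type: 'error', error: { code: 'overloaded' } })).toBe(false);
      });
    });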
Co-authored-by: Isaac --- e2e-chatbot-app-next/server/src/index.ts | 131 +++++++++++++---------- 1 file changed, 75 insertions(+), 56 deletions(-) diff --git a/e2e-chatbot-app-next/server/src/index.ts b/e2e-chatbot-app-next/server/src/index.ts index 73803de4..25a50fa1 100644 --- a/e2e-chatbot-app-next/server/src/index.ts +++ b/e2e-chatbot-app-next/server/src/index.ts @@ -24,6 +24,58 @@ import { ChatSDKError } from '@chat-template/core/errors'; const __filename = fileURLToPath(import.meta.url); const __dirname = dirname(__filename); +// --------------------------------------------------------------------------- +// SSE parsing helpers for the durable-resume /invocations proxy. +// Kept at module scope so each piece is a tiny, single-responsibility, easy +// to read / unit-test pure function. The proxy handler below composes them. +// --------------------------------------------------------------------------- + +type ParsedSseFrame = + | { kind: 'done' } + | { kind: 'passthrough' } + | { kind: 'data'; payload: Record }; + +/** Classify an SSE frame as [DONE], data (parseable JSON), or opaque. */ +function parseSseFrame(frame: string): ParsedSseFrame { + if (frame.includes('data: [DONE]')) return { kind: 'done' }; + const dataLine = frame.split('\n').find((l) => l.startsWith('data:')); + if (!dataLine) return { kind: 'passthrough' }; + try { + const payload = JSON.parse(dataLine.slice(5).trim()); + if (payload && typeof payload === 'object') { + return { kind: 'data', payload: payload as Record }; + } + } catch { + // Non-JSON frame (e.g. heartbeat): fall through to passthrough. + } + return { kind: 'passthrough' }; +} + +/** Pull the response_id off an SSE payload — tolerates the three shapes + * emitted by FastAPI ( `response_id`, `response.id`, top-level `id` prefixed + * with `resp_`). + */ +function extractResponseId( + payload: Record, +): string | null { + if (typeof payload.response_id === 'string') return payload.response_id; + const nested = payload.response as { id?: unknown } | undefined; + if (typeof nested?.id === 'string') return nested.id; + const topId = payload.id; + if (typeof topId === 'string' && topId.startsWith('resp_')) return topId; + return null; +} + +/** True for upstream error frames that will never succeed on retry + * (``task_failed`` / ``task_timeout``). Lets the proxy short-circuit its + * resume loop instead of burning the full attempt budget. */ +function isTerminalErrorFrame(payload: Record): boolean { + if (payload.type !== 'error') return false; + const err = (payload.error as Record | undefined) ?? {}; + const code = err.code as string | undefined; + return code === 'task_failed' || code === 'task_timeout'; +} + const app: Express = express(); const isDevelopment = process.env.NODE_ENV !== 'production'; // Either let PORT be set by env or use 3001 for development and 3000 for production @@ -144,76 +196,44 @@ if (agentBackendUrl) { let resumeAttempt = 0; // Read one SSE stream, extract response_id + sequence_number + detect - // terminal errors, forward each frame to the client. - // Returns whether we saw the [DONE] sentinel. - const pumpStream = async (upstream: globalThis.Response) => { + // terminal errors, forward each frame to the client. Returns whether we + // saw the [DONE] sentinel. Parsing is delegated to the module-level + // helpers above. 
+ const pumpStream = async ( + upstream: globalThis.Response, + ): Promise => { if (!upstream.body) return false; const reader = upstream.body.getReader(); const decoder = new TextDecoder(); let buf = ''; while (true) { const { done, value } = await reader.read(); - if (done) break; + if (done) return false; buf += decoder.decode(value, { stream: true }); const frames = buf.split(/\n\n/); buf = frames.pop() || ''; for (const frame of frames) { const frameBytes = `${frame}\n\n`; - if (frame.includes('data: [DONE]')) { + const parsed = parseSseFrame(frame); + if (parsed.kind === 'done') { res.write(frameBytes); return true; } - const dataLine = frame.split('\n').find((l) => l.startsWith('data:')); - if (!dataLine) { - res.write(frameBytes); - continue; - } - let parsed: Record | undefined; - try { - parsed = JSON.parse(dataLine.slice(5).trim()); - } catch { - // Non-JSON SSE frame (e.g. heartbeats) — forward as-is. - res.write(frameBytes); - continue; - } - if (!parsed) { - res.write(frameBytes); - continue; - } - // Track response_id (several possible locations). - const nested = parsed.response as - | { id?: unknown } - | undefined; - const rid = - (typeof parsed.response_id === 'string' - ? (parsed.response_id as string) - : undefined) ?? - (typeof nested?.id === 'string' ? nested.id : undefined) ?? - (typeof parsed.id === 'string' && - (parsed.id as string).startsWith('resp_') - ? (parsed.id as string) - : undefined); - if (!responseId && typeof rid === 'string') { - responseId = rid; - console.log(`[/invocations] background started response_id=${responseId}`); - } - if ( - typeof parsed.sequence_number === 'number' && - (parsed.sequence_number as number) > lastSeq - ) { - lastSeq = parsed.sequence_number as number; - } - const eventType = parsed.type as string | undefined; - // Detect terminal errors (task_failed, task_timeout, etc.) so we - // don't burn MAX_RESUME_ATTEMPTS fetching a response that will - // never succeed. Upstream LLM 502s and permanent run failures - // both surface here. - if (eventType === 'error') { - const errObj = (parsed.error as Record) || {}; - const code = errObj.code as string | undefined; - if (code === 'task_failed' || code === 'task_timeout') { + if (parsed.kind === 'data') { + const rid = extractResponseId(parsed.payload); + if (rid && !responseId) { + responseId = rid; + console.log( + `[/invocations] background started response_id=${responseId}`, + ); + } + const seq = parsed.payload.sequence_number; + if (typeof seq === 'number' && seq > lastSeq) { + lastSeq = seq; + } + if (isTerminalErrorFrame(parsed.payload)) { console.log( - `[/invocations] terminal error code=${code} response_id=${responseId}; not retrying`, + `[/invocations] terminal error response_id=${responseId}; not retrying`, ); sawTerminalError = true; } @@ -221,7 +241,6 @@ if (agentBackendUrl) { res.write(frameBytes); } } - return false; }; // Kickoff: POST background request. From 9f7e95d3ab87c4ded321426b18841258dc834a97 Mon Sep 17 00:00:00 2001 From: Dhruv Gupta Date: Thu, 23 Apr 2026 18:13:05 +0000 Subject: [PATCH 36/47] Default API_PROXY/AGENT_BACKEND_URL in chatbot, drop from advanced yaml MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both advanced templates were setting these env vars to hard-coded localhost URLs that match the bundled-process topology (Node on 3000, FastAPI on 8000). The values are fixed by the templates themselves — a customer deploying the advanced stack can't change them without breaking the bundle. 
Making them required in yaml adds noise without adding configurability. Push the defaults into the chatbot: - New ``getApiProxyUrl()`` helper in ``packages/ai-sdk-providers/src/ api-proxy.ts`` resolves the effective proxy URL: 1. explicit ``API_PROXY`` wins, 2. ``DATABRICKS_SERVING_ENDPOINT`` set → direct-endpoint mode, no proxy, 3. otherwise → ``http://localhost:${CHAT_APP_PORT|PORT|3000}/invocations`` (advanced-template convention). Used from ``providers-server.ts`` and ``request-context.ts`` so both agree on proxy activation. - ``server/src/index.ts`` defaults ``AGENT_BACKEND_URL`` to ``http://localhost:8000/invocations`` when unset. Explicit empty string still disables the ``/invocations`` proxy route. - Drop the ``API_PROXY`` / ``AGENT_BACKEND_URL`` block (and its comment) from both advanced templates' ``app.yaml`` and ``databricks.yml``. Preserves direct-serving-endpoint CUJs: when ``DATABRICKS_SERVING_ENDPOINT`` is set (basic chatbot deployments), the AI SDK talks straight to the endpoint and never hits ``/invocations``. Co-authored-by: Isaac --- agent-langgraph-advanced/app.yaml | 7 ------- agent-langgraph-advanced/databricks.yml | 7 ------- agent-openai-advanced/app.yaml | 7 ------- agent-openai-advanced/databricks.yml | 7 ------- .../packages/ai-sdk-providers/src/api-proxy.ts | 18 ++++++++++++++++++ .../ai-sdk-providers/src/providers-server.ts | 4 +++- .../ai-sdk-providers/src/request-context.ts | 8 ++++---- e2e-chatbot-app-next/server/src/index.ts | 8 +++++++- 8 files changed, 32 insertions(+), 34 deletions(-) create mode 100644 e2e-chatbot-app-next/packages/ai-sdk-providers/src/api-proxy.ts diff --git a/agent-langgraph-advanced/app.yaml b/agent-langgraph-advanced/app.yaml index aac3edf7..24ed65cc 100644 --- a/agent-langgraph-advanced/app.yaml +++ b/agent-langgraph-advanced/app.yaml @@ -6,13 +6,6 @@ env: value: "databricks" - name: MLFLOW_REGISTRY_URI value: "databricks-uc" - # API_PROXY points the Node AI SDK at the Express /invocations handler - # (port 3000) which rewrites to background mode and auto-resumes on - # disconnect. AGENT_BACKEND_URL is where that handler forwards. - - name: API_PROXY - value: "http://localhost:3000/invocations" - - name: AGENT_BACKEND_URL - value: "http://localhost:8000/invocations" - name: CHAT_APP_PORT value: "3000" - name: CHAT_PROXY_TIMEOUT_SECONDS diff --git a/agent-langgraph-advanced/databricks.yml b/agent-langgraph-advanced/databricks.yml index 94028630..2c826345 100644 --- a/agent-langgraph-advanced/databricks.yml +++ b/agent-langgraph-advanced/databricks.yml @@ -14,13 +14,6 @@ resources: value: "databricks" - name: MLFLOW_REGISTRY_URI value: "databricks-uc" - # API_PROXY points the Node AI SDK at the Express /invocations - # handler which rewrites to background mode and auto-resumes on - # disconnect. AGENT_BACKEND_URL is where that handler forwards. - - name: API_PROXY - value: "http://localhost:3000/invocations" - - name: AGENT_BACKEND_URL - value: "http://localhost:8000/invocations" - name: CHAT_APP_PORT value: "3000" - name: CHAT_PROXY_TIMEOUT_SECONDS diff --git a/agent-openai-advanced/app.yaml b/agent-openai-advanced/app.yaml index fab84034..24ed65cc 100644 --- a/agent-openai-advanced/app.yaml +++ b/agent-openai-advanced/app.yaml @@ -6,13 +6,6 @@ env: value: "databricks" - name: MLFLOW_REGISTRY_URI value: "databricks-uc" - # API_PROXY points the Node AI SDK at the Express /invocations handler - # (port 3000), which rewrites to background mode and auto-resumes on - # disconnect. AGENT_BACKEND_URL is where that handler forwards. 
- - name: API_PROXY - value: "http://localhost:3000/invocations" - - name: AGENT_BACKEND_URL - value: "http://localhost:8000/invocations" - name: CHAT_APP_PORT value: "3000" - name: CHAT_PROXY_TIMEOUT_SECONDS diff --git a/agent-openai-advanced/databricks.yml b/agent-openai-advanced/databricks.yml index 0874eb09..c943a4e7 100644 --- a/agent-openai-advanced/databricks.yml +++ b/agent-openai-advanced/databricks.yml @@ -15,13 +15,6 @@ resources: value: "databricks" - name: MLFLOW_REGISTRY_URI value: "databricks-uc" - # API_PROXY points the Node AI SDK at the Express /invocations - # handler which rewrites to background mode and auto-resumes on - # disconnect. AGENT_BACKEND_URL is where that handler forwards. - - name: API_PROXY - value: "http://localhost:3000/invocations" - - name: AGENT_BACKEND_URL - value: "http://localhost:8000/invocations" - name: CHAT_APP_PORT value: "3000" - name: CHAT_PROXY_TIMEOUT_SECONDS diff --git a/e2e-chatbot-app-next/packages/ai-sdk-providers/src/api-proxy.ts b/e2e-chatbot-app-next/packages/ai-sdk-providers/src/api-proxy.ts new file mode 100644 index 00000000..203abd55 --- /dev/null +++ b/e2e-chatbot-app-next/packages/ai-sdk-providers/src/api-proxy.ts @@ -0,0 +1,18 @@ +/** + * Single source of truth for the "route chat through the local /invocations + * proxy" decision. Returns the URL the AI SDK provider should POST to, or + * undefined when the chatbot should talk directly to a Databricks serving + * endpoint. + * + * Resolution order: + * 1. Explicit ``API_PROXY`` env var — caller knows best. + * 2. ``DATABRICKS_SERVING_ENDPOINT`` set — direct-endpoint mode; no proxy. + * 3. Advanced-template default — assume a local FastAPI agent is reachable + * via this Node server's own /invocations route, and route there. + */ +export function getApiProxyUrl(): string | undefined { + if (process.env.API_PROXY) return process.env.API_PROXY; + if (process.env.DATABRICKS_SERVING_ENDPOINT) return undefined; + const port = process.env.CHAT_APP_PORT || process.env.PORT || '3000'; + return `http://localhost:${port}/invocations`; +} diff --git a/e2e-chatbot-app-next/packages/ai-sdk-providers/src/providers-server.ts b/e2e-chatbot-app-next/packages/ai-sdk-providers/src/providers-server.ts index f19dad35..5d9220e7 100644 --- a/e2e-chatbot-app-next/packages/ai-sdk-providers/src/providers-server.ts +++ b/e2e-chatbot-app-next/packages/ai-sdk-providers/src/providers-server.ts @@ -71,7 +71,9 @@ export async function getWorkspaceHostname(): Promise { // Environment variable to enable SSE logging const LOG_SSE_EVENTS = process.env.LOG_SSE_EVENTS === 'true'; -const API_PROXY = process.env.API_PROXY; +import { getApiProxyUrl } from './api-proxy'; + +const API_PROXY = getApiProxyUrl(); // Cache for endpoint details to check task type and OBO scopes const endpointDetailsCache = new Map< diff --git a/e2e-chatbot-app-next/packages/ai-sdk-providers/src/request-context.ts b/e2e-chatbot-app-next/packages/ai-sdk-providers/src/request-context.ts index 4f08882a..52cd3788 100644 --- a/e2e-chatbot-app-next/packages/ai-sdk-providers/src/request-context.ts +++ b/e2e-chatbot-app-next/packages/ai-sdk-providers/src/request-context.ts @@ -2,11 +2,13 @@ * Utility functions for request context handling. */ +import { getApiProxyUrl } from './api-proxy'; + /** * Determines whether context should be injected based on endpoint type. * * Context is injected when: - * 1. Using API_PROXY environment variable, OR + * 1. The Express /invocations proxy is in play (explicit or inferred), OR * 2. 
Endpoint task type is 'agent/v2/chat' or 'agent/v1/responses' * * @param endpointTask - The task type of the serving endpoint (optional) @@ -15,9 +17,7 @@ export function shouldInjectContextForEndpoint( endpointTask: string | undefined, ): boolean { - const API_PROXY = process.env.API_PROXY; - - if (API_PROXY) { + if (getApiProxyUrl()) { return true; } diff --git a/e2e-chatbot-app-next/server/src/index.ts b/e2e-chatbot-app-next/server/src/index.ts index 25a50fa1..7aa9007c 100644 --- a/e2e-chatbot-app-next/server/src/index.ts +++ b/e2e-chatbot-app-next/server/src/index.ts @@ -124,7 +124,13 @@ app.use('/api/feedback', feedbackRouter); // set API_PROXY at THIS Express server (e.g. http://localhost:3000/invocations) // so the AI SDK provider in providers-server.ts routes through this handler // instead of going direct to FastAPI. -const agentBackendUrl = process.env.AGENT_BACKEND_URL || process.env.API_PROXY; +// Default to the advanced-template convention (FastAPI on :8000). Set +// AGENT_BACKEND_URL explicitly to point at a remote agent, or set it to +// empty string to disable the /invocations proxy altogether. +const agentBackendUrl = + process.env.AGENT_BACKEND_URL ?? + process.env.API_PROXY ?? + 'http://localhost:8000/invocations'; if (agentBackendUrl) { console.log( `✅ Proxying /invocations to ${agentBackendUrl} (durable-resume enabled)`, From ec73bc9e99513c010b536e93d7ae56ba6ba636af Mon Sep 17 00:00:00 2001 From: Dhruv Gupta Date: Thu, 23 Apr 2026 18:16:58 +0000 Subject: [PATCH 37/47] Restore API_PROXY line on advanced yaml files to match main Prior cleanup commit dropped ``API_PROXY=http://localhost:8000/invocations`` from the advanced templates' ``app.yaml`` and ``databricks.yml``. That line pre-existed on ``main``; the PR never meant to remove it. Scope of the previous change was only the *newly-added* ``API_PROXY`` + ``AGENT_BACKEND_URL`` block that activated the Node proxy path. Restore the four files to exactly match ``main``. The chatbot-side ``getApiProxyUrl()`` default only fires when ``API_PROXY`` is unset, so users with main's explicit setting keep their existing behavior. 
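For clarity, a quick walk-through of how the restored line interacts with the new default. Illustrative only; the env values are examples, and the import mirrors how providers-server.ts consumes the helper.

    // Resolution-order sketch for getApiProxyUrl(); not part of this commit.
    import { getApiProxyUrl } from './request-context';

    // 1. main's explicit yaml setting wins, so the chatbot-side default never fires.
    process.env.API_PROXY = 'http://localhost:8000/invocations';
    getApiProxyUrl(); // 'http://localhost:8000/invocations'

    // 2. Direct serving-endpoint deployments (basic chatbot) skip the proxy entirely.
    delete process.env.API_PROXY;
    process.env.DATABRICKS_SERVING_ENDPOINT = 'my-serving-endpoint';
    getApiProxyUrl(); // undefined

    // 3. Neither set: fall back to this Node server's own /invocations route.
    delete process.env.DATABRICKS_SERVING_ENDPOINT;
    process.env.CHAT_APP_PORT = '3000';
    getApiProxyUrl(); // 'http://localhost:3000/invocations'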
Co-authored-by: Isaac --- agent-langgraph-advanced/app.yaml | 2 ++ agent-langgraph-advanced/databricks.yml | 2 ++ agent-openai-advanced/app.yaml | 2 ++ agent-openai-advanced/databricks.yml | 2 ++ 4 files changed, 8 insertions(+) diff --git a/agent-langgraph-advanced/app.yaml b/agent-langgraph-advanced/app.yaml index 24ed65cc..1e406cb4 100644 --- a/agent-langgraph-advanced/app.yaml +++ b/agent-langgraph-advanced/app.yaml @@ -6,6 +6,8 @@ env: value: "databricks" - name: MLFLOW_REGISTRY_URI value: "databricks-uc" + - name: API_PROXY + value: "http://localhost:8000/invocations" - name: CHAT_APP_PORT value: "3000" - name: CHAT_PROXY_TIMEOUT_SECONDS diff --git a/agent-langgraph-advanced/databricks.yml b/agent-langgraph-advanced/databricks.yml index 2c826345..1ba990b3 100644 --- a/agent-langgraph-advanced/databricks.yml +++ b/agent-langgraph-advanced/databricks.yml @@ -14,6 +14,8 @@ resources: value: "databricks" - name: MLFLOW_REGISTRY_URI value: "databricks-uc" + - name: API_PROXY + value: "http://localhost:8000/invocations" - name: CHAT_APP_PORT value: "3000" - name: CHAT_PROXY_TIMEOUT_SECONDS diff --git a/agent-openai-advanced/app.yaml b/agent-openai-advanced/app.yaml index 24ed65cc..1e406cb4 100644 --- a/agent-openai-advanced/app.yaml +++ b/agent-openai-advanced/app.yaml @@ -6,6 +6,8 @@ env: value: "databricks" - name: MLFLOW_REGISTRY_URI value: "databricks-uc" + - name: API_PROXY + value: "http://localhost:8000/invocations" - name: CHAT_APP_PORT value: "3000" - name: CHAT_PROXY_TIMEOUT_SECONDS diff --git a/agent-openai-advanced/databricks.yml b/agent-openai-advanced/databricks.yml index c943a4e7..9c5b3c39 100644 --- a/agent-openai-advanced/databricks.yml +++ b/agent-openai-advanced/databricks.yml @@ -15,6 +15,8 @@ resources: value: "databricks" - name: MLFLOW_REGISTRY_URI value: "databricks-uc" + - name: API_PROXY + value: "http://localhost:8000/invocations" - name: CHAT_APP_PORT value: "3000" - name: CHAT_PROXY_TIMEOUT_SECONDS From 0517e23c5b6e8e3efae8752f0b79f6860ef798ba Mon Sep 17 00:00:00 2001 From: Dhruv Gupta Date: Thu, 23 Apr 2026 18:20:39 +0000 Subject: [PATCH 38/47] Fold getApiProxyUrl into request-context.ts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both helpers answer routing-decision questions for the provider layer (proxy URL + context-injection gate), and the separate file wasn't buying isolation — providers-server.ts already imports from request-context.ts. One file, same logic. Co-authored-by: Isaac --- .../packages/ai-sdk-providers/src/api-proxy.ts | 18 ------------------ .../ai-sdk-providers/src/providers-server.ts | 7 ++++--- .../ai-sdk-providers/src/request-context.ts | 17 ++++++++++++++++- 3 files changed, 20 insertions(+), 22 deletions(-) delete mode 100644 e2e-chatbot-app-next/packages/ai-sdk-providers/src/api-proxy.ts diff --git a/e2e-chatbot-app-next/packages/ai-sdk-providers/src/api-proxy.ts b/e2e-chatbot-app-next/packages/ai-sdk-providers/src/api-proxy.ts deleted file mode 100644 index 203abd55..00000000 --- a/e2e-chatbot-app-next/packages/ai-sdk-providers/src/api-proxy.ts +++ /dev/null @@ -1,18 +0,0 @@ -/** - * Single source of truth for the "route chat through the local /invocations - * proxy" decision. Returns the URL the AI SDK provider should POST to, or - * undefined when the chatbot should talk directly to a Databricks serving - * endpoint. - * - * Resolution order: - * 1. Explicit ``API_PROXY`` env var — caller knows best. - * 2. ``DATABRICKS_SERVING_ENDPOINT`` set — direct-endpoint mode; no proxy. - * 3. 
Advanced-template default — assume a local FastAPI agent is reachable - * via this Node server's own /invocations route, and route there. - */ -export function getApiProxyUrl(): string | undefined { - if (process.env.API_PROXY) return process.env.API_PROXY; - if (process.env.DATABRICKS_SERVING_ENDPOINT) return undefined; - const port = process.env.CHAT_APP_PORT || process.env.PORT || '3000'; - return `http://localhost:${port}/invocations`; -} diff --git a/e2e-chatbot-app-next/packages/ai-sdk-providers/src/providers-server.ts b/e2e-chatbot-app-next/packages/ai-sdk-providers/src/providers-server.ts index 5d9220e7..d2e7d6d1 100644 --- a/e2e-chatbot-app-next/packages/ai-sdk-providers/src/providers-server.ts +++ b/e2e-chatbot-app-next/packages/ai-sdk-providers/src/providers-server.ts @@ -10,7 +10,10 @@ import { } from '@chat-template/auth'; import { createDatabricksProvider } from '@databricks/ai-sdk-provider'; import { extractReasoningMiddleware, wrapLanguageModel } from 'ai'; -import { shouldInjectContextForEndpoint } from './request-context'; +import { + getApiProxyUrl, + shouldInjectContextForEndpoint, +} from './request-context'; // Header keys for passing context through streamText headers export const CONTEXT_HEADER_CONVERSATION_ID = 'x-databricks-conversation-id'; @@ -71,8 +74,6 @@ export async function getWorkspaceHostname(): Promise { // Environment variable to enable SSE logging const LOG_SSE_EVENTS = process.env.LOG_SSE_EVENTS === 'true'; -import { getApiProxyUrl } from './api-proxy'; - const API_PROXY = getApiProxyUrl(); // Cache for endpoint details to check task type and OBO scopes diff --git a/e2e-chatbot-app-next/packages/ai-sdk-providers/src/request-context.ts b/e2e-chatbot-app-next/packages/ai-sdk-providers/src/request-context.ts index 52cd3788..6980322a 100644 --- a/e2e-chatbot-app-next/packages/ai-sdk-providers/src/request-context.ts +++ b/e2e-chatbot-app-next/packages/ai-sdk-providers/src/request-context.ts @@ -2,7 +2,22 @@ * Utility functions for request context handling. */ -import { getApiProxyUrl } from './api-proxy'; +/** + * Resolve the URL the AI SDK provider should POST to, or ``undefined`` when + * the chatbot should talk directly to a Databricks serving endpoint. + * + * Resolution order: + * 1. Explicit ``API_PROXY`` env var — caller knows best. + * 2. ``DATABRICKS_SERVING_ENDPOINT`` set → direct-endpoint mode; no proxy. + * 3. Advanced-template default → route via this Node server's own + * ``/invocations`` proxy. + */ +export function getApiProxyUrl(): string | undefined { + if (process.env.API_PROXY) return process.env.API_PROXY; + if (process.env.DATABRICKS_SERVING_ENDPOINT) return undefined; + const port = process.env.CHAT_APP_PORT || process.env.PORT || '3000'; + return `http://localhost:${port}/invocations`; +} /** * Determines whether context should be injected based on endpoint type. From 0d308271590ffcb167774de0b9d9e556c945e635 Mon Sep 17 00:00:00 2001 From: Dhruv Gupta Date: Tue, 28 Apr 2026 00:07:44 +0000 Subject: [PATCH 39/47] Templates: pin to prose-recovery bridge branch + LangGraph thread_id surface Companion to databricks-ai-bridge#425 (POC for prose-recovery + always-rotate durable-resume). Minimal template-side changes: - agent-{openai,langgraph}-advanced/pyproject.toml: switch the databricks-ai-bridge / databricks-openai / databricks-langchain branch pins from `dhruv0811/durable-execution-resume` (the structured-repair PR #416) to `dhruv0811/durable-execution-prose-recovery` (the new POC). 
- agent-langgraph-advanced/agent_server/agent.py: invoke_handler now returns the resolved `thread_id` in `custom_outputs`. After a crash + resume, the bridge rotates `context.conversation_id` to `{base}::attempt-N`. Surfacing it here lets the client pass it back as `custom_inputs.thread_id` on the next turn, so subsequent turns land on the rotated (clean) checkpointer row instead of the orphan-poisoned original. The OpenAI template already does this via `session.session_id` in custom_outputs; LangGraph just didn't. Status ====== POC for review alongside databricks-ai-bridge#425. Not intended to merge unless empirical data justifies the trade vs PR #195. Co-authored-by: Isaac --- agent-langgraph-advanced/agent_server/agent.py | 7 ++++++- agent-langgraph-advanced/pyproject.toml | 4 ++-- agent-openai-advanced/pyproject.toml | 4 ++-- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/agent-langgraph-advanced/agent_server/agent.py b/agent-langgraph-advanced/agent_server/agent.py index 98e121b9..f86a472a 100644 --- a/agent-langgraph-advanced/agent_server/agent.py +++ b/agent-langgraph-advanced/agent_server/agent.py @@ -89,7 +89,12 @@ async def invoke_handler(request: ResponsesAgentRequest) -> ResponsesAgentRespon if event.type == "response.output_item.done" ] - custom_outputs: dict[str, Any] = {} + # Surface the resolved thread_id so always-rotate works cross-turn: after a + # crash + resume, the bridge rotates `context.conversation_id` to + # `{base}::attempt-N`. Returning that value here lets the client send it as + # `custom_inputs.thread_id` on the next turn, so subsequent turns land on + # the rotated (clean) checkpointer row instead of the orphan-poisoned one. + custom_outputs: dict[str, Any] = {"thread_id": _get_or_create_thread_id(request)} if user_id := get_user_id(request): custom_outputs["user_id"] = user_id return ResponsesAgentResponse(output=outputs, custom_outputs=custom_outputs) diff --git a/agent-langgraph-advanced/pyproject.toml b/agent-langgraph-advanced/pyproject.toml index cf326392..3973fada 100644 --- a/agent-langgraph-advanced/pyproject.toml +++ b/agent-langgraph-advanced/pyproject.toml @@ -42,8 +42,8 @@ default-groups = ["dev", "setup"] # TEMPORARY: point at the open PR branch while ML-64230 durable-resume # changes are in review. Revert to the registry release once merged. [tool.uv.sources] -databricks-ai-bridge = { git = "https://github.com/databricks/databricks-ai-bridge", branch = "dhruv0811/durable-execution-resume" } -databricks-langchain = { git = "https://github.com/databricks/databricks-ai-bridge", branch = "dhruv0811/durable-execution-resume", subdirectory = "integrations/langchain" } +databricks-ai-bridge = { git = "https://github.com/databricks/databricks-ai-bridge", branch = "dhruv0811/durable-execution-prose-recovery" } +databricks-langchain = { git = "https://github.com/databricks/databricks-ai-bridge", branch = "dhruv0811/durable-execution-prose-recovery", subdirectory = "integrations/langchain" } [tool.pytest.ini_options] diff --git a/agent-openai-advanced/pyproject.toml b/agent-openai-advanced/pyproject.toml index 87f62deb..84a6f52b 100644 --- a/agent-openai-advanced/pyproject.toml +++ b/agent-openai-advanced/pyproject.toml @@ -46,8 +46,8 @@ default-groups = ["dev", "setup"] # TEMPORARY: point at the open PR branch while ML-64230 durable-resume # changes are in review. Revert to the registry release once merged. 
[tool.uv.sources] -databricks-ai-bridge = { git = "https://github.com/databricks/databricks-ai-bridge", branch = "dhruv0811/durable-execution-resume" } -databricks-openai = { git = "https://github.com/databricks/databricks-ai-bridge", branch = "dhruv0811/durable-execution-resume", subdirectory = "integrations/openai" } +databricks-ai-bridge = { git = "https://github.com/databricks/databricks-ai-bridge", branch = "dhruv0811/durable-execution-prose-recovery" } +databricks-openai = { git = "https://github.com/databricks/databricks-ai-bridge", branch = "dhruv0811/durable-execution-prose-recovery", subdirectory = "integrations/openai" } [tool.pytest.ini_options] base_url = "http://localhost:8000" From 140399b8f9009d65facf2552ced7587ecc8dba09 Mon Sep 17 00:00:00 2001 From: Dhruv Gupta Date: Tue, 28 Apr 2026 01:02:03 +0000 Subject: [PATCH 40/47] Chatbot: capture rotated conversation_id from response.resumed sentinel MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Companion to databricks-ai-bridge#425 prose-recovery design. The bridge's always-rotate flow rotates `context.conversation_id` to `{base}::attempt-N` on every durable-resume and emits the rotated value in the `response.resumed` SSE event. This patch: - Maintains an in-memory `Map` in the shared AI-SDK provider's databricksFetch. - Captures the rotation by sniffing the SSE response for `response.resumed { conversation_id: ... }` events. - On subsequent requests for the same chat, swaps the rotated value into `context.conversation_id` before forwarding. Net effect: turn N+1 after a crash lands on the rotated (clean) SDK session instead of the orphan-poisoned original — closing the multi-turn gap without requiring SDK adapter wrappers in the bridge. In-memory only (single Express process). A multi-pod deployment would persist this on the chat row. Co-authored-by: Isaac --- .../ai-sdk-providers/src/providers-server.ts | 110 +++++++++++++++++- 1 file changed, 107 insertions(+), 3 deletions(-) diff --git a/e2e-chatbot-app-next/packages/ai-sdk-providers/src/providers-server.ts b/e2e-chatbot-app-next/packages/ai-sdk-providers/src/providers-server.ts index d2e7d6d1..da8044ae 100644 --- a/e2e-chatbot-app-next/packages/ai-sdk-providers/src/providers-server.ts +++ b/e2e-chatbot-app-next/packages/ai-sdk-providers/src/providers-server.ts @@ -76,6 +76,18 @@ const LOG_SSE_EVENTS = process.env.LOG_SSE_EVENTS === 'true'; const API_PROXY = getApiProxyUrl(); +// Always-rotate alias map: chat_id (the conversation_id the client uses) → +// rotated conversation_id from the most recent durable-resume. +// +// When the bridge resumes a crashed run, it rotates `context.conversation_id` +// to `{base}::attempt-N` and emits `response.resumed { conversation_id: ... }` +// in the SSE stream. We capture that rotation here so subsequent turns from +// the same chat send the rotated value as `context.conversation_id`, landing +// on the rotated (clean) SDK session instead of the original orphan-poisoned +// row. In-memory only — fine for a single Express process; a multi-pod +// deployment would persist this in the chat row. 
+const rotatedConversationIdByChat = new Map(); + // Cache for endpoint details to check task type and OBO scopes const endpointDetailsCache = new Map< string, @@ -99,6 +111,67 @@ function shouldInjectContext(): boolean { return shouldInjectContextForEndpoint(endpointTask); } +// Wrap an SSE response body so we can sniff `response.resumed` events and +// remember the rotated conversation_id for the originating chat. The original +// stream is passed through untouched to the AI SDK consumer. +function wrapResponseToCaptureRotation( + response: Response, + chatId: string, +): Response { + const originalBody = response.body!; + const reader = originalBody.getReader(); + const decoder = new TextDecoder(); + let buffer = ''; + + const wrapped = new ReadableStream({ + async pull(controller) { + const { done, value } = await reader.read(); + if (done) { + controller.close(); + return; + } + controller.enqueue(value); + + // Parse data: lines for response.resumed events. Buffered because + // SSE frames can split across chunks. + buffer += decoder.decode(value, { stream: true }); + const lines = buffer.split('\n'); + buffer = lines.pop() || ''; + for (const line of lines) { + if (!line.startsWith('data:')) continue; + const data = line.slice(5).trim(); + if (!data || data === '[DONE]') continue; + try { + const evt = JSON.parse(data); + if ( + evt?.type === 'response.resumed' && + typeof evt.conversation_id === 'string' + ) { + const prior = rotatedConversationIdByChat.get(chatId); + if (prior !== evt.conversation_id) { + rotatedConversationIdByChat.set(chatId, evt.conversation_id); + console.log( + `[durable-alias] chat ${chatId} ← rotated conversation_id ${evt.conversation_id} captured from response.resumed`, + ); + } + } + } catch { + // Non-JSON data line — ignore. + } + } + }, + cancel() { + reader.cancel(); + }, + }); + + return new Response(wrapped, { + status: response.status, + statusText: response.statusText, + headers: response.headers, + }); +} + // Custom fetch function to transform Databricks responses to OpenAI format export const databricksFetch: typeof fetch = async (input, init) => { const url = input.toString(); @@ -113,9 +186,26 @@ export const databricksFetch: typeof fetch = async (input, init) => { headers.delete(CONTEXT_HEADER_USER_ID); requestInit = { ...requestInit, headers }; - // Inject context into request body if appropriate + // Resolve the always-rotate alias: if a prior turn for this chat ended on + // a rotated conversation_id (durable-resume), use the rotated value so we + // land on the clean rotated SDK session instead of the orphan-poisoned + // original. 
+ const effectiveConversationId = + (conversationId && rotatedConversationIdByChat.get(conversationId)) || + conversationId; if ( + effectiveConversationId && conversationId && + effectiveConversationId !== conversationId + ) { + console.log( + `[durable-alias] chat ${conversationId} → using rotated conversation_id ${effectiveConversationId}`, + ); + } + + // Inject context into request body if appropriate + if ( + effectiveConversationId && userId && requestInit?.body && typeof requestInit.body === 'string' @@ -127,7 +217,7 @@ export const databricksFetch: typeof fetch = async (input, init) => { ...body, context: { ...body.context, - conversation_id: conversationId, + conversation_id: effectiveConversationId, user_id: userId, }, }; @@ -162,7 +252,21 @@ export const databricksFetch: typeof fetch = async (input, init) => { } } - const response = await fetch(url, requestInit); + let response = await fetch(url, requestInit); + + // Capture rotated conversation_id from the SSE stream's response.resumed + // sentinel so subsequent requests for the same chat use the rotated value + // (always-rotate alias). Wraps the body once; the SSE-logging wrap below + // composes on top if enabled. + if (response.body && conversationId) { + const contentType = response.headers.get('content-type') || ''; + if ( + contentType.includes('text/event-stream') || + contentType.includes('application/x-ndjson') + ) { + response = wrapResponseToCaptureRotation(response, conversationId); + } + } // If SSE logging is enabled and this is a streaming response, wrap the body to log events if (LOG_SSE_EVENTS && response.body) { From 31d87d60ad98d89d7bb124757323e5df765e20e5 Mon Sep 17 00:00:00 2001 From: Dhruv Gupta Date: Tue, 28 Apr 2026 01:15:33 +0000 Subject: [PATCH 41/47] agent-openai-advanced: trust session as authoritative for cross-turn dedup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous heuristic compared `session_items >= messages - 1` to decide whether to forward only the latest user message. Under prose-recovery + always-rotate, the rotated session has FEWER items than the chatbot's accumulated UI echo (attempt 2's session is fresh; UI accumulated events from both attempts), so the heuristic was returning all messages, including duplicates of attempt 2's tool_calls and the orphan from attempt 1. The Runner then combined session+input, producing duplicate function_call items that the OpenAI SDK groups into a malformed assistant.tool_calls block — Anthropic 400 with "tool_call_ids did not have response messages". Fix: if the session has any items at all, treat it as the authoritative source of cross-turn history and only forward the new user message. First-turn path (empty session) still returns the full input. 
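Concretely, with illustrative counts: after a mid-turn crash the rotated session might hold 3 items while the chatbot echo re-sends 6 messages. The old heuristic evaluates 3 >= 6 - 1 as false and forwards all 6, so the Runner stacks attempt 2's function_call/output pairs on top of the session's copies and Anthropic rejects the malformed assistant.tool_calls block. The new rule sees a non-empty session and forwards only messages[-1], the fresh user turn; an empty session (first turn) still gets the full input.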
Co-authored-by: Isaac --- agent-openai-advanced/agent_server/utils.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/agent-openai-advanced/agent_server/utils.py b/agent-openai-advanced/agent_server/utils.py index bec8919c..645f8126 100644 --- a/agent-openai-advanced/agent_server/utils.py +++ b/agent-openai-advanced/agent_server/utils.py @@ -204,7 +204,13 @@ async def deduplicate_input(request: ResponsesAgentRequest, session: AsyncDatabr ): msg["content"] = [{"type": "output_text", "text": msg["content"], "annotations": []}] session_items = await session.get_items() - if len(session_items) >= len(messages) - 1: + # If the session has any items, treat it as authoritative for prior-turn + # history and only forward the latest message (the new user turn). The + # prior count-based heuristic broke under prose-recovery + always-rotate: + # the rotated session can have fewer items than the chatbot's accumulated + # UI echo, leading the heuristic to forward duplicates that the SDK then + # groups into a malformed assistant.tool_calls block. + if session_items and len(messages) > 1: return [messages[-1]] return messages From 1aee3af133e3b28b5f8cbca167d188ce4ca083f6 Mon Sep 17 00:00:00 2001 From: Dhruv Gupta Date: Tue, 28 Apr 2026 01:31:22 +0000 Subject: [PATCH 42/47] agent-langgraph-advanced: trust checkpointer state as authoritative for cross-turn dedup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mirrors the same fix applied to agent-openai-advanced/utils.py (deduplicate_input). When the rotated checkpointer state exists for the current thread_id, only forward the latest user message from the chatbot's request input. Without this, the chatbot's full-history echo (including any orphan tool_use AIMessage from a crashed attempt 1 that the rotated checkpointer doesn't have) would be merged into state via `add_messages` and poison the next LLM call with an unpaired tool_use. Closes the multi-turn gap on the LangGraph side. The bridge (databricks-ai-bridge#425) no longer needs the input sanitizer (`tool_repair.py` + `_sanitize_request_input`) — between this LangGraph dedup and the OpenAI session-as-authoritative dedup, both templates handle UI-echo cleanly. 
Co-authored-by: Isaac --- .../agent_server/agent.py | 79 ++++++++++++++++++- 1 file changed, 78 insertions(+), 1 deletion(-) diff --git a/agent-langgraph-advanced/agent_server/agent.py b/agent-langgraph-advanced/agent_server/agent.py index f86a472a..20fa267d 100644 --- a/agent-langgraph-advanced/agent_server/agent.py +++ b/agent-langgraph-advanced/agent_server/agent.py @@ -1,3 +1,4 @@ +import asyncio import logging from datetime import datetime from typing import Any, AsyncGenerator, Optional, Sequence, TypedDict @@ -50,6 +51,57 @@ def get_current_time() -> str: return datetime.now().isoformat() +@tool +def get_weather(city: str) -> str: + """Return a short weather summary for the given city.""" + stubs = { + "new york": "72°F, partly cloudy, light wind", + "los angeles": "78°F, sunny, mild humidity", + "tokyo": "65°F, rain, chance of thunderstorms", + "paris": "60°F, overcast, occasional drizzle", + "london": "55°F, foggy, light rain", + "sydney": "82°F, sunny, breezy", + } + return stubs.get(city.lower(), f"70°F, clear skies (stub for {city})") + + +@tool +def get_stock_price(ticker: str) -> str: + """Return a simulated stock price for the given ticker symbol.""" + stubs = { + "AAPL": "$187.42 (+1.2%)", + "GOOGL": "$141.78 (-0.4%)", + "MSFT": "$415.06 (+0.8%)", + "NVDA": "$885.91 (+2.7%)", + "TSLA": "$204.33 (-1.5%)", + } + return stubs.get(ticker.upper(), f"$100.00 (stub for {ticker.upper()})") + + +@tool +def search_best_restaurants(city: str) -> str: + """Find a short list of notable restaurants in the given city.""" + stubs = { + "paris": "Le Comptoir du Relais, Septime, Chez L'Ami Jean", + "tokyo": "Sukiyabashi Jiro, Narisawa, Den", + "new york": "Eleven Madison Park, Le Bernardin, Daniel", + } + return stubs.get( + city.lower(), f"Local favorites in {city}: Cafe One, The Bistro, Riverside Kitchen" + ) + + +@tool +async def deep_research(topic: str) -> str: + """Run an in-depth multi-source research on the given topic. Takes ~15 seconds.""" + await asyncio.sleep(15) + return ( + f"Research summary on '{topic}': key findings include " + "historical context, current consensus, and two leading " + "counter-arguments. (stubbed 15s simulated research)" + ) + + class StatefulAgentState(TypedDict, total=False): messages: Annotated[Sequence[AnyMessage], add_messages] custom_inputs: dict[str, Any] @@ -61,7 +113,13 @@ async def init_agent( workspace_client: Optional[WorkspaceClient] = None, checkpointer: Optional[Any] = None, ): - tools = [get_current_time] + memory_tools() + tools = [ + get_current_time, + get_weather, + get_stock_price, + search_best_restaurants, + deep_research, + ] + memory_tools() # To use MCP server tools instead, uncomment the below lines: # mcp_client = init_mcp_client(workspace_client or sp_workspace_client) # try: @@ -128,6 +186,25 @@ async def stream_handler( # For on-behalf-of user authentication, pass get_user_workspace_client() to init_agent. agent = await init_agent(store=store, checkpointer=checkpointer) + # Always-rotate dedup: when the checkpointer has state for this + # (rotated) thread_id, trust it as authoritative for prior-turn + # history and only forward the latest user message. Without this + # the chatbot's full-history echo would re-inject prior turns — + # including any orphan tool_use messages from a crashed attempt 1 + # that the rotated checkpointer state doesn't have — and + # `add_messages` would append them, poisoning the LLM call. 
+ state = await agent.aget_state(config) + if state and state.values.get("messages") and input_state["messages"]: + last_user = next( + ( + m + for m in reversed(input_state["messages"]) + if m.get("role") == "user" + ), + None, + ) + input_state["messages"] = [last_user] if last_user else [] + async for event in process_agent_astream_events( agent.astream(input_state, config, stream_mode=["updates", "messages"]) ): From dfa14ce0929d3ee3b8dabc98afc93f2cda1fbdd9 Mon Sep 17 00:00:00 2001 From: Dhruv Gupta Date: Tue, 28 Apr 2026 01:45:12 +0000 Subject: [PATCH 43/47] Revert per-template dedup hooks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cross-turn echo dedup is now handled SDK-agnostically inside the bridge via _trim_echoed_history (databricks-ai-bridge#425). Both templates' agent.py / utils.py go back to main — no per-SDK calls into session.get_items() / agent.aget_state(), no thread_id surface in custom_outputs. The remaining template-side change for the always-rotate flow is e2e-chatbot-app-next/packages/ai-sdk-providers/src/providers-server.ts (alias capture from response.resumed sentinel + injection on outgoing requests). Co-authored-by: Isaac --- .../agent_server/agent.py | 88 +------------------ agent-openai-advanced/agent_server/utils.py | 17 ++-- 2 files changed, 10 insertions(+), 95 deletions(-) diff --git a/agent-langgraph-advanced/agent_server/agent.py b/agent-langgraph-advanced/agent_server/agent.py index 20fa267d..22d0e8bc 100644 --- a/agent-langgraph-advanced/agent_server/agent.py +++ b/agent-langgraph-advanced/agent_server/agent.py @@ -1,4 +1,3 @@ -import asyncio import logging from datetime import datetime from typing import Any, AsyncGenerator, Optional, Sequence, TypedDict @@ -41,7 +40,7 @@ logging.getLogger("mlflow.utils.autologging_utils").setLevel(logging.ERROR) sp_workspace_client = WorkspaceClient() -LLM_ENDPOINT_NAME = "databricks-claude-sonnet-4-5" +LLM_ENDPOINT_NAME = "databricks-gpt-5-2" LAKEBASE_CONFIG = init_lakebase_config() @@ -51,57 +50,6 @@ def get_current_time() -> str: return datetime.now().isoformat() -@tool -def get_weather(city: str) -> str: - """Return a short weather summary for the given city.""" - stubs = { - "new york": "72°F, partly cloudy, light wind", - "los angeles": "78°F, sunny, mild humidity", - "tokyo": "65°F, rain, chance of thunderstorms", - "paris": "60°F, overcast, occasional drizzle", - "london": "55°F, foggy, light rain", - "sydney": "82°F, sunny, breezy", - } - return stubs.get(city.lower(), f"70°F, clear skies (stub for {city})") - - -@tool -def get_stock_price(ticker: str) -> str: - """Return a simulated stock price for the given ticker symbol.""" - stubs = { - "AAPL": "$187.42 (+1.2%)", - "GOOGL": "$141.78 (-0.4%)", - "MSFT": "$415.06 (+0.8%)", - "NVDA": "$885.91 (+2.7%)", - "TSLA": "$204.33 (-1.5%)", - } - return stubs.get(ticker.upper(), f"$100.00 (stub for {ticker.upper()})") - - -@tool -def search_best_restaurants(city: str) -> str: - """Find a short list of notable restaurants in the given city.""" - stubs = { - "paris": "Le Comptoir du Relais, Septime, Chez L'Ami Jean", - "tokyo": "Sukiyabashi Jiro, Narisawa, Den", - "new york": "Eleven Madison Park, Le Bernardin, Daniel", - } - return stubs.get( - city.lower(), f"Local favorites in {city}: Cafe One, The Bistro, Riverside Kitchen" - ) - - -@tool -async def deep_research(topic: str) -> str: - """Run an in-depth multi-source research on the given topic. 
Takes ~15 seconds.""" - await asyncio.sleep(15) - return ( - f"Research summary on '{topic}': key findings include " - "historical context, current consensus, and two leading " - "counter-arguments. (stubbed 15s simulated research)" - ) - - class StatefulAgentState(TypedDict, total=False): messages: Annotated[Sequence[AnyMessage], add_messages] custom_inputs: dict[str, Any] @@ -113,13 +61,7 @@ async def init_agent( workspace_client: Optional[WorkspaceClient] = None, checkpointer: Optional[Any] = None, ): - tools = [ - get_current_time, - get_weather, - get_stock_price, - search_best_restaurants, - deep_research, - ] + memory_tools() + tools = [get_current_time] + memory_tools() # To use MCP server tools instead, uncomment the below lines: # mcp_client = init_mcp_client(workspace_client or sp_workspace_client) # try: @@ -147,12 +89,7 @@ async def invoke_handler(request: ResponsesAgentRequest) -> ResponsesAgentRespon if event.type == "response.output_item.done" ] - # Surface the resolved thread_id so always-rotate works cross-turn: after a - # crash + resume, the bridge rotates `context.conversation_id` to - # `{base}::attempt-N`. Returning that value here lets the client send it as - # `custom_inputs.thread_id` on the next turn, so subsequent turns land on - # the rotated (clean) checkpointer row instead of the orphan-poisoned one. - custom_outputs: dict[str, Any] = {"thread_id": _get_or_create_thread_id(request)} + custom_outputs: dict[str, Any] = {} if user_id := get_user_id(request): custom_outputs["user_id"] = user_id return ResponsesAgentResponse(output=outputs, custom_outputs=custom_outputs) @@ -186,25 +123,6 @@ async def stream_handler( # For on-behalf-of user authentication, pass get_user_workspace_client() to init_agent. agent = await init_agent(store=store, checkpointer=checkpointer) - # Always-rotate dedup: when the checkpointer has state for this - # (rotated) thread_id, trust it as authoritative for prior-turn - # history and only forward the latest user message. Without this - # the chatbot's full-history echo would re-inject prior turns — - # including any orphan tool_use messages from a crashed attempt 1 - # that the rotated checkpointer state doesn't have — and - # `add_messages` would append them, poisoning the LLM call. 
- state = await agent.aget_state(config) - if state and state.values.get("messages") and input_state["messages"]: - last_user = next( - ( - m - for m in reversed(input_state["messages"]) - if m.get("role") == "user" - ), - None, - ) - input_state["messages"] = [last_user] if last_user else [] - async for event in process_agent_astream_events( agent.astream(input_state, config, stream_mode=["updates", "messages"]) ): diff --git a/agent-openai-advanced/agent_server/utils.py b/agent-openai-advanced/agent_server/utils.py index 645f8126..7cd07e8c 100644 --- a/agent-openai-advanced/agent_server/utils.py +++ b/agent-openai-advanced/agent_server/utils.py @@ -29,6 +29,7 @@ class LakebaseConfig: autoscaling_endpoint: Optional[str] autoscaling_project: Optional[str] autoscaling_branch: Optional[str] + memory_schema: Optional[str] = None @property def description(self) -> str: @@ -103,13 +104,15 @@ def init_lakebase_config() -> LakebaseConfig: " Option 3 (provisioned): LAKEBASE_INSTANCE_NAME=\n" ) + memory_schema = os.getenv("LAKEBASE_AGENT_MEMORY_SCHEMA") or None + # Priority: endpoint > project+branch > instance_name (mutually exclusive in the library) if endpoint: - return LakebaseConfig(instance_name=None, autoscaling_endpoint=endpoint, autoscaling_project=None, autoscaling_branch=None) + return LakebaseConfig(instance_name=None, autoscaling_endpoint=endpoint, autoscaling_project=None, autoscaling_branch=None, memory_schema=memory_schema) elif has_autoscaling: - return LakebaseConfig(instance_name=None, autoscaling_endpoint=None, autoscaling_project=project, autoscaling_branch=branch) + return LakebaseConfig(instance_name=None, autoscaling_endpoint=None, autoscaling_project=project, autoscaling_branch=branch, memory_schema=memory_schema) else: - return LakebaseConfig(instance_name=resolve_lakebase_instance_name(raw_name), autoscaling_endpoint=None, autoscaling_project=None, autoscaling_branch=None) + return LakebaseConfig(instance_name=resolve_lakebase_instance_name(raw_name), autoscaling_endpoint=None, autoscaling_project=None, autoscaling_branch=None, memory_schema=memory_schema) def get_lakebase_access_error_message(lakebase_description: str) -> str: @@ -204,13 +207,7 @@ async def deduplicate_input(request: ResponsesAgentRequest, session: AsyncDatabr ): msg["content"] = [{"type": "output_text", "text": msg["content"], "annotations": []}] session_items = await session.get_items() - # If the session has any items, treat it as authoritative for prior-turn - # history and only forward the latest message (the new user turn). The - # prior count-based heuristic broke under prose-recovery + always-rotate: - # the rotated session can have fewer items than the chatbot's accumulated - # UI echo, leading the heuristic to forward duplicates that the SDK then - # groups into a malformed assistant.tool_calls block. - if session_items and len(messages) > 1: + if len(session_items) >= len(messages) - 1: return [messages[-1]] return messages From 018492f8bc28ee06603f4c3c89d1d66f7054e35e Mon Sep 17 00:00:00 2001 From: Dhruv Gupta Date: Tue, 28 Apr 2026 02:12:06 +0000 Subject: [PATCH 44/47] chatbot: drop non-null assertion in wrapResponseToCaptureRotation Replace `response.body!` with an explicit early-return guard. Functionally identical (the SSE check above already implies a body exists), but satisfies Biome's lint/style/noNonNullAssertion rule introduced by my prior commit. 
--- .../packages/ai-sdk-providers/src/providers-server.ts | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/e2e-chatbot-app-next/packages/ai-sdk-providers/src/providers-server.ts b/e2e-chatbot-app-next/packages/ai-sdk-providers/src/providers-server.ts index da8044ae..28777b76 100644 --- a/e2e-chatbot-app-next/packages/ai-sdk-providers/src/providers-server.ts +++ b/e2e-chatbot-app-next/packages/ai-sdk-providers/src/providers-server.ts @@ -118,8 +118,10 @@ function wrapResponseToCaptureRotation( response: Response, chatId: string, ): Response { - const originalBody = response.body!; - const reader = originalBody.getReader(); + if (!response.body) { + return response; + } + const reader = response.body.getReader(); const decoder = new TextDecoder(); let buffer = ''; From 9d4af2058b85e8fbfe842a740034cd0b51a6d982 Mon Sep 17 00:00:00 2001 From: Dhruv Gupta Date: Tue, 28 Apr 2026 18:51:54 +0000 Subject: [PATCH 45/47] Revert agent_langgraph_memory from _MANAGED_SCHEMAS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per Bryan's review feedback, the test framework's _MANAGED_SCHEMAS list isn't the right layer for handling memory- schema permissions — that crosses a layer boundary into per- template configuration. The right shape is: * Workspace setup grants USAGE on workspace-managed schemas to the writer role; SPs inherit it automatically. * Per-template grant_lakebase_permissions.py owns its own table list and grants relative to whichever schema the agent is configured to use (via LAKEBASE_AGENT_MEMORY_SCHEMA env var). In the autoscaling test branch we already have: databricks_writer_16401=UC/... on agent_langgraph_memory which means new SPs created by the test deploy already have USAGE through role inheritance. Combined with the workspace-side ALTER DATABASE search_path that exposes the schema by default, the deployed app resolves the pgvector type without any test- framework grants on this schema. Co-authored-by: Isaac --- .scripts/agent-integration-tests/helpers.py | 312 ++++++++++++++++---- 1 file changed, 247 insertions(+), 65 deletions(-) diff --git a/.scripts/agent-integration-tests/helpers.py b/.scripts/agent-integration-tests/helpers.py index 259abfc2..1b21881f 100644 --- a/.scripts/agent-integration-tests/helpers.py +++ b/.scripts/agent-integration-tests/helpers.py @@ -25,40 +25,133 @@ BUNDLE_TIMEOUT = 600 # seconds for bundle deploy/run/destroy commands (10 min for parallel runs) QUICKSTART_TIMEOUT = 600 # seconds for quickstart command (10 min for parallel runs) EVALUATE_TIMEOUT = 900 # seconds for agent-evaluate -SERVER_START_TIMEOUT = 60 # seconds to wait for local server to start +SERVER_START_TIMEOUT = 600 # seconds to wait for local server to start (accommodates cold CI runners + heavy template imports) # --------------------------------------------------------------------------- # Logging & subprocess # --------------------------------------------------------------------------- +# Design: stdout is the human-readable stream (timestamps, ✓/✗ markers, +# collapsible GH Actions groups, short summaries on subprocess success). +# The per-thread log file (logs/{template}.log) gets the full unstructured +# text — including every subprocess's stdout/stderr — so post-mortem grep +# and paste-into-playbook workflows keep working unchanged. +# +# Two rules of thumb: +# * _log() prints to both stdout (with timestamp) and log file (raw). 
+# * _run_cmd: terse on stdout when successful, full detail on failure; +# always writes full detail to the log file. _thread_local = threading.local() _log_lock = threading.Lock() +# Only emit GH Actions log-grouping directives when running under Actions. +# Outside CI they'd just be noise in developer terminals. +_IN_CI = os.environ.get("GITHUB_ACTIONS") == "true" + def set_log_file(log_file: Path | None): """Set the log file for the current thread.""" _thread_local.log_file = log_file -def _log(msg: str): - """Write to the current thread's log file and stdout.""" - print(msg) +def _ts() -> str: + """Short HH:MM:SS timestamp for stdout log-line prefixes.""" + return time.strftime("%H:%M:%S") + + +def _fmt_duration(seconds: float) -> str: + """Human-friendly duration: '0.4s', '12.1s', '1m 23s', '1h 2m 3s'.""" + if seconds < 60: + return f"{seconds:.1f}s" + m, s = divmod(int(seconds), 60) + if m < 60: + return f"{m}m {s}s" + h, m = divmod(m, 60) + return f"{h}h {m}m {s}s" + + +def _write_to_log_file(msg: str) -> None: + """Append raw text (no timestamp) to the current thread's log file.""" log_file = getattr(_thread_local, "log_file", None) - if log_file: - with _log_lock, open(log_file, "a") as f: - f.write(msg + "\n") + if log_file is None: + return + with _log_lock, open(log_file, "a") as f: + f.write(msg + "\n") + + +def _log(msg: str) -> None: + """Log to stdout (with timestamp) and log file (raw). + + Multi-line messages: timestamp only on the first line so continuation + lines stay aligned and the file copy is byte-identical to the message. + """ + if msg == "": + print("") + else: + lines = msg.split("\n") + print(f"[{_ts()}] {lines[0]}") + for line in lines[1:]: + print(line) + _write_to_log_file(msg) -def _run_cmd(cmd: list[str], **kwargs) -> subprocess.CompletedProcess: - """Run a subprocess, log its output, and return the result.""" +def _gh_group(title: str) -> None: + """Open a collapsible GH Actions log group (no-op outside CI).""" + if _IN_CI: + print(f"::group::{title}") + + +def _gh_endgroup() -> None: + """Close the current GH Actions log group (no-op outside CI).""" + if _IN_CI: + print("::endgroup::") + + +def _run_cmd(cmd: list[str], *, verbose: bool = False, **kwargs) -> subprocess.CompletedProcess: + """Run a subprocess and return the result. + + Logging behaviour: + * Full command + exit + stdout + stderr always go to the log file. + * Stdout (the CI log stream) gets a one-line summary on success + — ``✓ ()`` — and full detail on failure. + * Pass ``verbose=True`` to force full output on both paths (useful + when the command's output is itself the test signal). + """ kwargs.setdefault("capture_output", True) kwargs.setdefault("text", True) - _log(f"$ {' '.join(cmd)}") - result = subprocess.run(cmd, **kwargs) - _log(f" exit={result.returncode}") + cmd_str = " ".join(cmd) + short_cmd = " ".join(cmd[:3]) + ("..." if len(cmd) > 3 else "") + t0 = time.monotonic() + + _write_to_log_file(f"$ {cmd_str}") + + try: + result = subprocess.run(cmd, **kwargs) + except subprocess.TimeoutExpired: + duration = time.monotonic() - t0 + # Make timeouts loud on stdout; the file gets the same line plus + # any partial output captured by the caller. 
+ print(f"[{_ts()}] ✗ timeout: {cmd_str} (after {_fmt_duration(duration)})") + _write_to_log_file(f" TIMEOUT after {_fmt_duration(duration)}") + raise + + duration = time.monotonic() - t0 + + _write_to_log_file(f" exit={result.returncode} ({_fmt_duration(duration)})") if result.stdout: - _log(f" stdout:\n{result.stdout.rstrip()}") + _write_to_log_file(f" stdout:\n{result.stdout.rstrip()}") if result.stderr: - _log(f" stderr:\n{result.stderr.rstrip()}") + _write_to_log_file(f" stderr:\n{result.stderr.rstrip()}") + + if result.returncode == 0 and not verbose: + print(f"[{_ts()}] ✓ {short_cmd} ({_fmt_duration(duration)})") + else: + marker = "✓" if result.returncode == 0 else "✗" + print(f"[{_ts()}] {marker} {cmd_str} (exit {result.returncode}, {_fmt_duration(duration)})") + if result.stdout: + print(f" stdout:\n{result.stdout.rstrip()}") + if result.stderr: + print(f" stderr:\n{result.stderr.rstrip()}") + return result @@ -120,11 +213,18 @@ def copy_template(template_dir: Path, app_name_suffix: str = "-p") -> Path: yml_path = tmp_dir / "databricks.yml" if yml_path.exists(): text = yml_path.read_text() - # Patch bundle name so it uses a separate workspace path and terraform state - # e.g. bundle.name: "agent_langgraph_advanced" -> "agent_langgraph_advanced_p" + # Patch bundle.name (the first top-level `name:` in the file — + # unquoted identifier like `agent_langgraph_advanced`) so the + # copy gets its own workspace path .bundle// and its own + # terraform state. Previously the regex required quoted values + # and silently no-op'd on unquoted bundle.name, meaning the + # "isolated" copy actually shared state with the original — + # terraform state races and source-upload collisions ensued. + # e.g. ` name: agent_langgraph_advanced` -> + # ` name: agent_langgraph_advanced_p` suffix_underscore = app_name_suffix.replace("-", "_") patched = re.sub( - r'(^\s*name:\s*")([\w]+)(")', + r"^(\s*name:\s*)(\w+)(\s*)$", lambda m: m.group(1) + m.group(2) + suffix_underscore + m.group(3), text, count=1, @@ -160,16 +260,22 @@ def clean_template(template_dir: Path): pass # Already removed by another parallel worker -def uv_sync(template_dir: Path): +def uv_sync(template_dir: Path, max_attempts: int = 3): """Run `uv sync` to create/update the venv before quickstart. - Tries online first; falls back to UV_OFFLINE=true on failure (useful when - git+ deps are cached locally and the network fetch hangs or fails). + Retries up to ``max_attempts`` times with a short backoff to absorb + transient PyPI / proxy hiccups, then falls back to UV_OFFLINE=true + one more time as a last resort. """ - result = _run_cmd(["uv", "sync"], cwd=template_dir, timeout=QUICKSTART_TIMEOUT) - if result.returncode == 0: - return - _log(f" uv sync failed online, retrying with UV_OFFLINE=true...") + for attempt in range(1, max_attempts + 1): + result = _run_cmd(["uv", "sync"], cwd=template_dir, timeout=QUICKSTART_TIMEOUT) + if result.returncode == 0: + return + if attempt < max_attempts: + _log(f" uv sync attempt {attempt}/{max_attempts} failed, retrying in 10s...") + time.sleep(10) + + _log(f" uv sync failed online; falling back to UV_OFFLINE=true (cache-only)...") env = os.environ.copy() env["UV_OFFLINE"] = "true" result = _run_cmd(["uv", "sync"], cwd=template_dir, timeout=QUICKSTART_TIMEOUT, env=env) @@ -374,16 +480,8 @@ def find_free_port() -> int: return s.getsockname()[1] -def start_server(template_dir: Path, port: int = 0) -> tuple[subprocess.Popen, int]: - """Start `uv run start-server` as background process. 
- - If port is 0 (default), dynamically allocates a free port. - Waits for 'Uvicorn running on' in stderr (timeout 60s). - Returns (process handle, port). - """ - if port == 0: - port = find_free_port() - +def _start_server_once(template_dir: Path, port: int) -> tuple[subprocess.Popen, int]: + """Single attempt at starting `uv run start-server`. See start_server.""" _log(f"Starting server on port {port} in {template_dir.name}") proc = subprocess.Popen( ["uv", "run", "start-server", "--port", str(port)], @@ -415,6 +513,39 @@ def start_server(template_dir: Path, port: int = 0) -> tuple[subprocess.Popen, i raise TimeoutError(f"Server did not start within {SERVER_START_TIMEOUT} seconds") +def start_server(template_dir: Path, port: int = 0, max_attempts: int = 2) -> tuple[subprocess.Popen, int]: + """Start `uv run start-server` as a background process, with one retry. + + If port is 0, dynamically allocates a free port. Watches stderr for + ``Uvicorn running on`` or ``Application startup complete`` (timeout + SERVER_START_TIMEOUT per attempt). + + Retries once on timeout: we've observed uvicorn hanging between + "Started server process" and "Waiting for application startup" on GH + Actions runners for some templates (not deterministic — same template + passes on one run, hangs on the next). A second attempt from a fresh + process typically succeeds. + + Raises TimeoutError if all attempts time out; RuntimeError if the + server process exits early. Returns (process handle, port) on success. + """ + for attempt in range(1, max_attempts + 1): + allocated_port = port or find_free_port() + try: + return _start_server_once(template_dir, allocated_port) + except TimeoutError as exc: + if attempt >= max_attempts: + raise + _log( + f"start_server attempt {attempt}/{max_attempts} timed out; " + f"killing and retrying with a fresh process. " + f"({exc})" + ) + # fall through to next iteration — allocates a new port, + # spawns a new subprocess. + raise RuntimeError("start_server exited the retry loop without a result") # unreachable + + def stop_server(proc: subprocess.Popen): """Kill process group to ensure all children die.""" try: @@ -595,6 +726,8 @@ def bundle_deploy( - Terraform init failures (e.g. GitHub 502): wait and retry - "already exists" (app): unbind stale state + bind existing app, retry - "does not exist or is deleted": unbind stale reference, retry + - "lineage mismatch in state files": a prior run left stale terraform + state on the same bundle path. Unbind and wipe local state, retry. """ def recover(stderr: str, attempt: int, max_attempts: int) -> bool: @@ -642,6 +775,20 @@ def recover(stderr: str, attempt: int, max_attempts: int) -> bool: time.sleep(POLL_INTERVAL) return True + if "lineage mismatch" in stderr_flat: + _log( + f"bundle deploy attempt {attempt}/{max_attempts} failed in " + f"{template_dir.name} (tf state lineage mismatch), unbinding " + f"and wiping local state..." + ) + _bundle_unbind(template_dir, app_resource_key, profile) + # Remove local terraform state copy; a fresh deploy will repopulate. 
+ databricks_state = template_dir / ".databricks" + if databricks_state.is_dir(): + shutil.rmtree(databricks_state, ignore_errors=True) + time.sleep(POLL_INTERVAL) + return True + if "is not terminal" in stderr_flat or "not terminal with state" in stderr_flat: _log( f"bundle deploy attempt {attempt}/{max_attempts} failed in " @@ -650,10 +797,17 @@ def recover(stderr: str, attempt: int, max_attempts: int) -> bool: time.sleep(POLL_INTERVAL) return True - if "DELETING" in stderr_flat: + # "Cannot update app as its compute is in state. App + # compute needs to be ACTIVE or STOPPED to update." — the app's + # compute is mid-transition (STARTING / DELETING / DEPLOYING / + # UPDATING) and can't accept a bundle update. Wait it out and + # retry. Match on the stable part of the error so it covers + # every transient state the API can emit. + if "ACTIVE or STOPPED to update" in stderr_flat: _log( f"bundle deploy attempt {attempt}/{max_attempts} failed in " - f"{template_dir.name} (compute deleting), waiting {POLL_INTERVAL}s..." + f"{template_dir.name} (app compute mid-transition), " + f"waiting {POLL_INTERVAL}s..." ) time.sleep(POLL_INTERVAL) return True @@ -673,37 +827,61 @@ def recover(stderr: str, attempt: int, max_attempts: int) -> bool: _restore_uv_sources(template_dir, pyproject_backup) -def bundle_run_nowait(template_dir: Path, resource_key: str, profile: str): +def bundle_run_nowait( + template_dir: Path, + resource_key: str, + profile: str, + app_name: str | None = None, +): """Trigger `databricks bundle run` to start the app, then return quickly. Despite --no-wait, the CLI may still poll briefly for startup. We cap the wait at 90s and swallow timeout errors — the app start was initiated on the server side and will continue even if the CLI process is killed. Use wait_for_app_ready() after this to poll until RUNNING. + + Recovery: if bundle run fails with "Invalid source code path ... does + not exist", the prior `bundle deploy` didn't fully upload the bundle + source (known failure mode when a previous run's cleanup left the app + definition bound but wiped its source dir). Re-run bundle_deploy to + force a fresh upload, then retry bundle_run once. ``app_name`` must be + provided to enable this recovery. """ import subprocess as _subprocess - try: - _run_cmd( - [ - "databricks", - "bundle", - "run", - resource_key, - "--no-wait", - "--target", - "dev", - "-p", - profile, - ], - cwd=template_dir, - timeout=BUNDLE_TIMEOUT, - ) - except _subprocess.TimeoutExpired: + cmd = [ + "databricks", "bundle", "run", resource_key, + "--no-wait", "--target", "dev", "-p", profile, + ] + for attempt in range(1, 3): + try: + result = _run_cmd(cmd, cwd=template_dir, timeout=BUNDLE_TIMEOUT) + except _subprocess.TimeoutExpired: + _log( + f"bundle run --no-wait for {resource_key} timed out after {BUNDLE_TIMEOUT}s " + f"— app start was initiated, polling via wait_for_app_ready()" + ) + return + + if result.returncode == 0: + return + + stderr_flat = " ".join(result.stderr.split()) + if "Invalid source code path" in stderr_flat and app_name and attempt == 1: + _log( + f"bundle run failed: source_code_path missing on workspace " + f"(likely stale state from prior run); re-deploying to " + f"re-upload source, then retrying..." + ) + bundle_deploy(template_dir, profile, resource_key, app_name) + continue + + # Other failure mode — log and let wait_for_app_ready surface it. 
_log( - f"bundle run --no-wait for {resource_key} timed out after {BUNDLE_TIMEOUT}s " - f"— app start was initiated, polling via wait_for_app_ready()" + f"bundle run --no-wait for {resource_key} exited {result.returncode}; " + f"proceeding to wait_for_app_ready (may time out)" ) + return def bundle_run(template_dir: Path, resource_key: str, profile: str): @@ -761,15 +939,19 @@ def bundle_destroy(template_dir: Path, profile: str): def get_oauth_token(profile: str) -> str: - """Get token from `databricks auth token -p `.""" - result = _run_cmd( - ["databricks", "auth", "token", "-p", profile], - timeout=60, - ) - assert result.returncode == 0, f"Failed to get auth token: {result.stderr}" - data = json.loads(result.stdout) - token = data.get("access_token", "") - assert token, "No access_token in auth response" + """Get an OAuth bearer token for the given Databricks CLI profile. + + Uses the Databricks SDK so this works for both U2M (personal OAuth) and + M2M (service-principal client_id/client_secret) profiles. The CLI's + `databricks auth token` subcommand only supports U2M, which breaks + CI runs against an SP profile. + """ + from databricks.sdk import WorkspaceClient + + w = WorkspaceClient(profile=profile) + auth_header = w.config.authenticate() + token = auth_header.get("Authorization", "").removeprefix("Bearer ").strip() + assert token, f"No OAuth token returned for profile {profile!r}" return token From 2a5c1cfe7689fbc97d611bfd26bad571b31d421c Mon Sep 17 00:00:00 2001 From: Dhruv Gupta Date: Tue, 28 Apr 2026 18:54:22 +0000 Subject: [PATCH 46/47] Revert "Revert agent_langgraph_memory from _MANAGED_SCHEMAS" This reverts commit 9d4af2058b85e8fbfe842a740034cd0b51a6d982. --- .scripts/agent-integration-tests/helpers.py | 312 ++++---------------- 1 file changed, 65 insertions(+), 247 deletions(-) diff --git a/.scripts/agent-integration-tests/helpers.py b/.scripts/agent-integration-tests/helpers.py index 1b21881f..259abfc2 100644 --- a/.scripts/agent-integration-tests/helpers.py +++ b/.scripts/agent-integration-tests/helpers.py @@ -25,133 +25,40 @@ BUNDLE_TIMEOUT = 600 # seconds for bundle deploy/run/destroy commands (10 min for parallel runs) QUICKSTART_TIMEOUT = 600 # seconds for quickstart command (10 min for parallel runs) EVALUATE_TIMEOUT = 900 # seconds for agent-evaluate -SERVER_START_TIMEOUT = 600 # seconds to wait for local server to start (accommodates cold CI runners + heavy template imports) +SERVER_START_TIMEOUT = 60 # seconds to wait for local server to start # --------------------------------------------------------------------------- # Logging & subprocess # --------------------------------------------------------------------------- -# Design: stdout is the human-readable stream (timestamps, ✓/✗ markers, -# collapsible GH Actions groups, short summaries on subprocess success). -# The per-thread log file (logs/{template}.log) gets the full unstructured -# text — including every subprocess's stdout/stderr — so post-mortem grep -# and paste-into-playbook workflows keep working unchanged. -# -# Two rules of thumb: -# * _log() prints to both stdout (with timestamp) and log file (raw). -# * _run_cmd: terse on stdout when successful, full detail on failure; -# always writes full detail to the log file. _thread_local = threading.local() _log_lock = threading.Lock() -# Only emit GH Actions log-grouping directives when running under Actions. -# Outside CI they'd just be noise in developer terminals. 
-_IN_CI = os.environ.get("GITHUB_ACTIONS") == "true" - def set_log_file(log_file: Path | None): """Set the log file for the current thread.""" _thread_local.log_file = log_file -def _ts() -> str: - """Short HH:MM:SS timestamp for stdout log-line prefixes.""" - return time.strftime("%H:%M:%S") - - -def _fmt_duration(seconds: float) -> str: - """Human-friendly duration: '0.4s', '12.1s', '1m 23s', '1h 2m 3s'.""" - if seconds < 60: - return f"{seconds:.1f}s" - m, s = divmod(int(seconds), 60) - if m < 60: - return f"{m}m {s}s" - h, m = divmod(m, 60) - return f"{h}h {m}m {s}s" - - -def _write_to_log_file(msg: str) -> None: - """Append raw text (no timestamp) to the current thread's log file.""" +def _log(msg: str): + """Write to the current thread's log file and stdout.""" + print(msg) log_file = getattr(_thread_local, "log_file", None) - if log_file is None: - return - with _log_lock, open(log_file, "a") as f: - f.write(msg + "\n") - - -def _log(msg: str) -> None: - """Log to stdout (with timestamp) and log file (raw). - - Multi-line messages: timestamp only on the first line so continuation - lines stay aligned and the file copy is byte-identical to the message. - """ - if msg == "": - print("") - else: - lines = msg.split("\n") - print(f"[{_ts()}] {lines[0]}") - for line in lines[1:]: - print(line) - _write_to_log_file(msg) + if log_file: + with _log_lock, open(log_file, "a") as f: + f.write(msg + "\n") -def _gh_group(title: str) -> None: - """Open a collapsible GH Actions log group (no-op outside CI).""" - if _IN_CI: - print(f"::group::{title}") - - -def _gh_endgroup() -> None: - """Close the current GH Actions log group (no-op outside CI).""" - if _IN_CI: - print("::endgroup::") - - -def _run_cmd(cmd: list[str], *, verbose: bool = False, **kwargs) -> subprocess.CompletedProcess: - """Run a subprocess and return the result. - - Logging behaviour: - * Full command + exit + stdout + stderr always go to the log file. - * Stdout (the CI log stream) gets a one-line summary on success - — ``✓ ()`` — and full detail on failure. - * Pass ``verbose=True`` to force full output on both paths (useful - when the command's output is itself the test signal). - """ +def _run_cmd(cmd: list[str], **kwargs) -> subprocess.CompletedProcess: + """Run a subprocess, log its output, and return the result.""" kwargs.setdefault("capture_output", True) kwargs.setdefault("text", True) - cmd_str = " ".join(cmd) - short_cmd = " ".join(cmd[:3]) + ("..." if len(cmd) > 3 else "") - t0 = time.monotonic() - - _write_to_log_file(f"$ {cmd_str}") - - try: - result = subprocess.run(cmd, **kwargs) - except subprocess.TimeoutExpired: - duration = time.monotonic() - t0 - # Make timeouts loud on stdout; the file gets the same line plus - # any partial output captured by the caller. 
- print(f"[{_ts()}] ✗ timeout: {cmd_str} (after {_fmt_duration(duration)})") - _write_to_log_file(f" TIMEOUT after {_fmt_duration(duration)}") - raise - - duration = time.monotonic() - t0 - - _write_to_log_file(f" exit={result.returncode} ({_fmt_duration(duration)})") + _log(f"$ {' '.join(cmd)}") + result = subprocess.run(cmd, **kwargs) + _log(f" exit={result.returncode}") if result.stdout: - _write_to_log_file(f" stdout:\n{result.stdout.rstrip()}") + _log(f" stdout:\n{result.stdout.rstrip()}") if result.stderr: - _write_to_log_file(f" stderr:\n{result.stderr.rstrip()}") - - if result.returncode == 0 and not verbose: - print(f"[{_ts()}] ✓ {short_cmd} ({_fmt_duration(duration)})") - else: - marker = "✓" if result.returncode == 0 else "✗" - print(f"[{_ts()}] {marker} {cmd_str} (exit {result.returncode}, {_fmt_duration(duration)})") - if result.stdout: - print(f" stdout:\n{result.stdout.rstrip()}") - if result.stderr: - print(f" stderr:\n{result.stderr.rstrip()}") - + _log(f" stderr:\n{result.stderr.rstrip()}") return result @@ -213,18 +120,11 @@ def copy_template(template_dir: Path, app_name_suffix: str = "-p") -> Path: yml_path = tmp_dir / "databricks.yml" if yml_path.exists(): text = yml_path.read_text() - # Patch bundle.name (the first top-level `name:` in the file — - # unquoted identifier like `agent_langgraph_advanced`) so the - # copy gets its own workspace path .bundle// and its own - # terraform state. Previously the regex required quoted values - # and silently no-op'd on unquoted bundle.name, meaning the - # "isolated" copy actually shared state with the original — - # terraform state races and source-upload collisions ensued. - # e.g. ` name: agent_langgraph_advanced` -> - # ` name: agent_langgraph_advanced_p` + # Patch bundle name so it uses a separate workspace path and terraform state + # e.g. bundle.name: "agent_langgraph_advanced" -> "agent_langgraph_advanced_p" suffix_underscore = app_name_suffix.replace("-", "_") patched = re.sub( - r"^(\s*name:\s*)(\w+)(\s*)$", + r'(^\s*name:\s*")([\w]+)(")', lambda m: m.group(1) + m.group(2) + suffix_underscore + m.group(3), text, count=1, @@ -260,22 +160,16 @@ def clean_template(template_dir: Path): pass # Already removed by another parallel worker -def uv_sync(template_dir: Path, max_attempts: int = 3): +def uv_sync(template_dir: Path): """Run `uv sync` to create/update the venv before quickstart. - Retries up to ``max_attempts`` times with a short backoff to absorb - transient PyPI / proxy hiccups, then falls back to UV_OFFLINE=true - one more time as a last resort. + Tries online first; falls back to UV_OFFLINE=true on failure (useful when + git+ deps are cached locally and the network fetch hangs or fails). 
""" - for attempt in range(1, max_attempts + 1): - result = _run_cmd(["uv", "sync"], cwd=template_dir, timeout=QUICKSTART_TIMEOUT) - if result.returncode == 0: - return - if attempt < max_attempts: - _log(f" uv sync attempt {attempt}/{max_attempts} failed, retrying in 10s...") - time.sleep(10) - - _log(f" uv sync failed online; falling back to UV_OFFLINE=true (cache-only)...") + result = _run_cmd(["uv", "sync"], cwd=template_dir, timeout=QUICKSTART_TIMEOUT) + if result.returncode == 0: + return + _log(f" uv sync failed online, retrying with UV_OFFLINE=true...") env = os.environ.copy() env["UV_OFFLINE"] = "true" result = _run_cmd(["uv", "sync"], cwd=template_dir, timeout=QUICKSTART_TIMEOUT, env=env) @@ -480,8 +374,16 @@ def find_free_port() -> int: return s.getsockname()[1] -def _start_server_once(template_dir: Path, port: int) -> tuple[subprocess.Popen, int]: - """Single attempt at starting `uv run start-server`. See start_server.""" +def start_server(template_dir: Path, port: int = 0) -> tuple[subprocess.Popen, int]: + """Start `uv run start-server` as background process. + + If port is 0 (default), dynamically allocates a free port. + Waits for 'Uvicorn running on' in stderr (timeout 60s). + Returns (process handle, port). + """ + if port == 0: + port = find_free_port() + _log(f"Starting server on port {port} in {template_dir.name}") proc = subprocess.Popen( ["uv", "run", "start-server", "--port", str(port)], @@ -513,39 +415,6 @@ def _start_server_once(template_dir: Path, port: int) -> tuple[subprocess.Popen, raise TimeoutError(f"Server did not start within {SERVER_START_TIMEOUT} seconds") -def start_server(template_dir: Path, port: int = 0, max_attempts: int = 2) -> tuple[subprocess.Popen, int]: - """Start `uv run start-server` as a background process, with one retry. - - If port is 0, dynamically allocates a free port. Watches stderr for - ``Uvicorn running on`` or ``Application startup complete`` (timeout - SERVER_START_TIMEOUT per attempt). - - Retries once on timeout: we've observed uvicorn hanging between - "Started server process" and "Waiting for application startup" on GH - Actions runners for some templates (not deterministic — same template - passes on one run, hangs on the next). A second attempt from a fresh - process typically succeeds. - - Raises TimeoutError if all attempts time out; RuntimeError if the - server process exits early. Returns (process handle, port) on success. - """ - for attempt in range(1, max_attempts + 1): - allocated_port = port or find_free_port() - try: - return _start_server_once(template_dir, allocated_port) - except TimeoutError as exc: - if attempt >= max_attempts: - raise - _log( - f"start_server attempt {attempt}/{max_attempts} timed out; " - f"killing and retrying with a fresh process. " - f"({exc})" - ) - # fall through to next iteration — allocates a new port, - # spawns a new subprocess. - raise RuntimeError("start_server exited the retry loop without a result") # unreachable - - def stop_server(proc: subprocess.Popen): """Kill process group to ensure all children die.""" try: @@ -726,8 +595,6 @@ def bundle_deploy( - Terraform init failures (e.g. GitHub 502): wait and retry - "already exists" (app): unbind stale state + bind existing app, retry - "does not exist or is deleted": unbind stale reference, retry - - "lineage mismatch in state files": a prior run left stale terraform - state on the same bundle path. Unbind and wipe local state, retry. 
""" def recover(stderr: str, attempt: int, max_attempts: int) -> bool: @@ -775,20 +642,6 @@ def recover(stderr: str, attempt: int, max_attempts: int) -> bool: time.sleep(POLL_INTERVAL) return True - if "lineage mismatch" in stderr_flat: - _log( - f"bundle deploy attempt {attempt}/{max_attempts} failed in " - f"{template_dir.name} (tf state lineage mismatch), unbinding " - f"and wiping local state..." - ) - _bundle_unbind(template_dir, app_resource_key, profile) - # Remove local terraform state copy; a fresh deploy will repopulate. - databricks_state = template_dir / ".databricks" - if databricks_state.is_dir(): - shutil.rmtree(databricks_state, ignore_errors=True) - time.sleep(POLL_INTERVAL) - return True - if "is not terminal" in stderr_flat or "not terminal with state" in stderr_flat: _log( f"bundle deploy attempt {attempt}/{max_attempts} failed in " @@ -797,17 +650,10 @@ def recover(stderr: str, attempt: int, max_attempts: int) -> bool: time.sleep(POLL_INTERVAL) return True - # "Cannot update app as its compute is in state. App - # compute needs to be ACTIVE or STOPPED to update." — the app's - # compute is mid-transition (STARTING / DELETING / DEPLOYING / - # UPDATING) and can't accept a bundle update. Wait it out and - # retry. Match on the stable part of the error so it covers - # every transient state the API can emit. - if "ACTIVE or STOPPED to update" in stderr_flat: + if "DELETING" in stderr_flat: _log( f"bundle deploy attempt {attempt}/{max_attempts} failed in " - f"{template_dir.name} (app compute mid-transition), " - f"waiting {POLL_INTERVAL}s..." + f"{template_dir.name} (compute deleting), waiting {POLL_INTERVAL}s..." ) time.sleep(POLL_INTERVAL) return True @@ -827,61 +673,37 @@ def recover(stderr: str, attempt: int, max_attempts: int) -> bool: _restore_uv_sources(template_dir, pyproject_backup) -def bundle_run_nowait( - template_dir: Path, - resource_key: str, - profile: str, - app_name: str | None = None, -): +def bundle_run_nowait(template_dir: Path, resource_key: str, profile: str): """Trigger `databricks bundle run` to start the app, then return quickly. Despite --no-wait, the CLI may still poll briefly for startup. We cap the wait at 90s and swallow timeout errors — the app start was initiated on the server side and will continue even if the CLI process is killed. Use wait_for_app_ready() after this to poll until RUNNING. - - Recovery: if bundle run fails with "Invalid source code path ... does - not exist", the prior `bundle deploy` didn't fully upload the bundle - source (known failure mode when a previous run's cleanup left the app - definition bound but wiped its source dir). Re-run bundle_deploy to - force a fresh upload, then retry bundle_run once. ``app_name`` must be - provided to enable this recovery. 
""" import subprocess as _subprocess - cmd = [ - "databricks", "bundle", "run", resource_key, - "--no-wait", "--target", "dev", "-p", profile, - ] - for attempt in range(1, 3): - try: - result = _run_cmd(cmd, cwd=template_dir, timeout=BUNDLE_TIMEOUT) - except _subprocess.TimeoutExpired: - _log( - f"bundle run --no-wait for {resource_key} timed out after {BUNDLE_TIMEOUT}s " - f"— app start was initiated, polling via wait_for_app_ready()" - ) - return - - if result.returncode == 0: - return - - stderr_flat = " ".join(result.stderr.split()) - if "Invalid source code path" in stderr_flat and app_name and attempt == 1: - _log( - f"bundle run failed: source_code_path missing on workspace " - f"(likely stale state from prior run); re-deploying to " - f"re-upload source, then retrying..." - ) - bundle_deploy(template_dir, profile, resource_key, app_name) - continue - - # Other failure mode — log and let wait_for_app_ready surface it. + try: + _run_cmd( + [ + "databricks", + "bundle", + "run", + resource_key, + "--no-wait", + "--target", + "dev", + "-p", + profile, + ], + cwd=template_dir, + timeout=BUNDLE_TIMEOUT, + ) + except _subprocess.TimeoutExpired: _log( - f"bundle run --no-wait for {resource_key} exited {result.returncode}; " - f"proceeding to wait_for_app_ready (may time out)" + f"bundle run --no-wait for {resource_key} timed out after {BUNDLE_TIMEOUT}s " + f"— app start was initiated, polling via wait_for_app_ready()" ) - return def bundle_run(template_dir: Path, resource_key: str, profile: str): @@ -939,19 +761,15 @@ def bundle_destroy(template_dir: Path, profile: str): def get_oauth_token(profile: str) -> str: - """Get an OAuth bearer token for the given Databricks CLI profile. - - Uses the Databricks SDK so this works for both U2M (personal OAuth) and - M2M (service-principal client_id/client_secret) profiles. The CLI's - `databricks auth token` subcommand only supports U2M, which breaks - CI runs against an SP profile. - """ - from databricks.sdk import WorkspaceClient - - w = WorkspaceClient(profile=profile) - auth_header = w.config.authenticate() - token = auth_header.get("Authorization", "").removeprefix("Bearer ").strip() - assert token, f"No OAuth token returned for profile {profile!r}" + """Get token from `databricks auth token -p `.""" + result = _run_cmd( + ["databricks", "auth", "token", "-p", profile], + timeout=60, + ) + assert result.returncode == 0, f"Failed to get auth token: {result.stderr}" + data = json.loads(result.stdout) + token = data.get("access_token", "") + assert token, "No access_token in auth response" return token From 8ecf2b7518938ee1bd7c32524fb0797f741b3a3c Mon Sep 17 00:00:00 2001 From: Dhruv Gupta Date: Thu, 30 Apr 2026 03:32:36 +0000 Subject: [PATCH 47/47] Move UI-echo dedup into per-template handlers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Companion to databricks-ai-bridge#425's removal of `_trim_echoed_history`. Per design discussion with Bryan, echo dedup is an agent-layer concern — the agent owns its SDK session/checkpointer and is the right layer to know what's already persisted vs what's a new turn. agent-openai-advanced/agent_server/utils.py - Update `deduplicate_input` heuristic from `len(session_items) >= len(messages) - 1` to `session_items and len(messages) > 1`. The old count-based check broke under prose-recovery + always-rotate (rotated session has fewer items than the chatbot's accumulated UI echo). 
The new check trusts the session as authoritative for prior turns whenever it has any items. agent-langgraph-advanced/agent_server/agent.py - Add `aget_state` probe in `stream_handler`. When the checkpointer already has messages for this thread, drop everything in input except the latest user message before passing to `agent.astream`. Without this, `add_messages` would append the chatbot's full-history echo — it dedupes by `id`, but MLflow's `responses_to_cc` doesn't preserve IDs, so dedup never fires across the bridge boundary. Both: ~10 lines per template, runs at the same point the SDK session read happens, no SDK adapter wrapping. Co-authored-by: Isaac --- agent-langgraph-advanced/agent_server/agent.py | 14 ++++++++++++++ agent-openai-advanced/agent_server/utils.py | 14 ++++++++++---- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/agent-langgraph-advanced/agent_server/agent.py b/agent-langgraph-advanced/agent_server/agent.py index 22d0e8bc..5ec6a1b4 100644 --- a/agent-langgraph-advanced/agent_server/agent.py +++ b/agent-langgraph-advanced/agent_server/agent.py @@ -123,6 +123,20 @@ async def stream_handler( # For on-behalf-of user authentication, pass get_user_workspace_client() to init_agent. agent = await init_agent(store=store, checkpointer=checkpointer) + # When the checkpointer already has prior turns for this thread, + # the chat client's full-history echo is redundant — `add_messages` + # would append duplicates (it dedupes by `id`, but MLflow's + # `responses_to_cc` doesn't preserve IDs, so dedup never fires). + # Forward only the latest user message; the checkpointer prepends + # the rest. + state = await agent.aget_state(config) + if state and state.values.get("messages") and input_state["messages"]: + last_user = next( + (m for m in reversed(input_state["messages"]) if m.get("role") == "user"), + None, + ) + input_state["messages"] = [last_user] if last_user else [] + async for event in process_agent_astream_events( agent.astream(input_state, config, stream_mode=["updates", "messages"]) ): diff --git a/agent-openai-advanced/agent_server/utils.py b/agent-openai-advanced/agent_server/utils.py index 7cd07e8c..54f60655 100644 --- a/agent-openai-advanced/agent_server/utils.py +++ b/agent-openai-advanced/agent_server/utils.py @@ -190,9 +190,12 @@ async def deduplicate_input(request: ResponsesAgentRequest, session: AsyncDatabr """Return the input messages to pass to the Runner, avoiding duplication with session history. When a client sends the full conversation history AND the session already has - that history persisted, passing everything through would duplicate messages. - If the session already covers the prior turns, only the latest message is needed - since the session will prepend the full history automatically. + that history persisted, passing everything through would duplicate messages + in the LLM call (Runner combines session items + input items, and the OpenAI + SDK's `_dedupe_key` doesn't dedupe role-bearing items — see + `agents/run_internal/items.py:224-250`). If the session already has any + items, the prior turns are persisted there and we only need to forward the + latest user message. """ messages = [i.model_dump() for i in request.input] # Normalize assistant message content from string to structured list format. 
@@ -207,7 +210,10 @@ async def deduplicate_input(request: ResponsesAgentRequest, session: AsyncDatabr ): msg["content"] = [{"type": "output_text", "text": msg["content"], "annotations": []}] session_items = await session.get_items() - if len(session_items) >= len(messages) - 1: + # Trust the session as authoritative for prior turns. Forward only the + # latest message (the new user turn). The Runner will prepend session + # history on the LLM call automatically. + if session_items and len(messages) > 1: return [messages[-1]] return messages
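For illustration, the decision the new heuristic makes can be reduced to a few lines. The sketch below is not the template's code: FakeSession, dedup, and demo are hypothetical stand-ins for the Lakebase-backed SDK session and the real deduplicate_input, and it assumes, as the docstring above states, that the Runner prepends persisted session items to whatever input is forwarded.

# Illustrative only: a hypothetical in-memory stand-in for the Lakebase-backed
# session, used to show which messages the heuristic forwards.
import asyncio
from typing import Any


class FakeSession:
    """Pretend SDK session holding items persisted from prior turns."""

    def __init__(self, items: list[dict[str, Any]]):
        self._items = items

    async def get_items(self) -> list[dict[str, Any]]:
        return self._items


async def dedup(messages: list[dict[str, Any]], session: FakeSession) -> list[dict[str, Any]]:
    """Mirror of the new check: trust the session whenever it has any items."""
    session_items = await session.get_items()
    if session_items and len(messages) > 1:
        return [messages[-1]]  # forward only the new user turn
    return messages


async def demo() -> None:
    history = [
        {"role": "user", "content": "hi"},
        {"role": "assistant", "content": "hello!"},
    ]
    new_turn = {"role": "user", "content": "what did I just say?"}

    # Rotated session: fewer persisted items (1) than the UI echo (3 messages).
    # The old count-based check (1 >= 3 - 1 is False) would forward all three
    # messages and duplicate history; the new check forwards only the new turn.
    rotated = FakeSession(items=history[:1])
    assert await dedup(history + [new_turn], rotated) == [new_turn]

    # Fresh session with nothing persisted: pass the input through unchanged.
    assert await dedup([new_turn], FakeSession(items=[])) == [new_turn]


if __name__ == "__main__":
    asyncio.run(demo())

Run directly, both asserts pass: the rotated-session case that broke the old count-based heuristic and the fresh-session case where everything must still pass through.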