From 764f40852c51f4ee38d928e3194a6233d50ed356 Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Tue, 31 Mar 2026 18:42:50 +0000 Subject: [PATCH 1/7] feat(container): add drainAll() to process-manager for graceful container eviction Implements 4-phase drain sequence: 1. Notify TownDO via POST /container-eviction 2. Nudge running polecat/refinery agents via sendMessage() 3. Poll up to 10 min waiting for agents to finish 4. Force-save stragglers with WIP git commit + push via Bun.spawn --- .../container/src/process-manager.ts | 124 ++++++++++++++++++ 1 file changed, 124 insertions(+) diff --git a/cloudflare-gastown/container/src/process-manager.ts b/cloudflare-gastown/container/src/process-manager.ts index f0008e320f..23c796d6df 100644 --- a/cloudflare-gastown/container/src/process-manager.ts +++ b/cloudflare-gastown/container/src/process-manager.ts @@ -973,6 +973,130 @@ export function activeServerCount(): number { return sdkInstances.size; } +/** + * Gracefully drain all running agents before container eviction. + * + * 4-phase sequence: + * 1. Notify TownDO of the eviction (fire-and-forget) + * 2. Nudge running polecats/refineries to commit & push + * 3. Poll up to 10 min waiting for agents to finish + * 4. Force-save any stragglers via WIP git commit + push + * + * Never throws — all errors are logged and swallowed so the caller + * can always proceed to stopAll() + process.exit(). + */ +export async function drainAll(): Promise { + const DRAIN_LOG = '[drain]'; + + // ── Phase 1: Notify TownDO ────────────────────────────────────────── + try { + const apiUrl = process.env.GASTOWN_API_URL; + const token = process.env.GASTOWN_CONTAINER_TOKEN; + // Grab townId from any registered agent — all agents in a container + // belong to the same town. + const anyAgent = [...agents.values()][0]; + const townId = anyAgent?.townId; + + if (apiUrl && token && townId) { + console.log(`${DRAIN_LOG} Phase 1: notifying TownDO of container eviction`); + const resp = await fetch(`${apiUrl}/api/towns/${townId}/container-eviction`, { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + Authorization: `Bearer ${token}`, + }, + }); + console.log(`${DRAIN_LOG} Phase 1: TownDO responded ${resp.status}`); + } else { + console.warn( + `${DRAIN_LOG} Phase 1: skipping TownDO notification (missing apiUrl=${!!apiUrl} token=${!!token} townId=${!!townId})` + ); + } + } catch (err) { + console.warn(`${DRAIN_LOG} Phase 1: TownDO notification failed, continuing:`, err); + } + + // ── Phase 2: Nudge running agents to save ─────────────────────────── + const runningAgents = [...agents.values()].filter(a => a.status === 'running'); + console.log(`${DRAIN_LOG} Phase 2: nudging ${runningAgents.length} running agents`); + + for (const agent of runningAgents) { + try { + let nudgeMessage: string | null = null; + + if (agent.role === 'polecat') { + nudgeMessage = + 'URGENT: The container is shutting down in ~15 minutes. Please commit and push your current changes immediately, then call gt_done. You have 2 minutes before a forced save.'; + } else if (agent.role === 'refinery') { + nudgeMessage = + 'URGENT: The container is shutting down. If your review is complete, call gt_done now. Otherwise your work will be pushed as a WIP commit.'; + } + // Mayor and other roles: no nudge needed + + if (nudgeMessage) { + console.log(`${DRAIN_LOG} Phase 2: nudging ${agent.role} agent ${agent.agentId}`); + await sendMessage(agent.agentId, nudgeMessage); + } + } catch (err) { + console.warn( + `${DRAIN_LOG} Phase 2: failed to nudge agent ${agent.agentId} (${agent.role}):`, + err + ); + } + } + + // ── Phase 3: Wait up to 10 minutes ────────────────────────────────── + const DRAIN_WAIT_MS = 10 * 60 * 1000; + const pollInterval = 5000; + const start = Date.now(); + console.log(`${DRAIN_LOG} Phase 3: waiting up to ${DRAIN_WAIT_MS / 1000}s for agents to finish`); + + while (Date.now() - start < DRAIN_WAIT_MS) { + const running = [...agents.values()].filter(a => a.status === 'running'); + if (running.length === 0) break; + console.log(`${DRAIN_LOG} Waiting for ${running.length} agents...`); + await new Promise(r => setTimeout(r, pollInterval)); + } + + // ── Phase 4: Force-save remaining agents ──────────────────────────── + const stragglers = [...agents.values()].filter(a => a.status === 'running'); + if (stragglers.length > 0) { + console.log(`${DRAIN_LOG} Phase 4: force-saving ${stragglers.length} straggler(s)`); + } else { + console.log(`${DRAIN_LOG} Phase 4: all agents finished, no force-save needed`); + } + + for (const agent of stragglers) { + try { + console.log(`${DRAIN_LOG} Phase 4: force-saving agent ${agent.agentId} in ${agent.workdir}`); + const proc = Bun.spawn( + [ + 'bash', + '-c', + "git add -A && git commit --allow-empty -m 'WIP: container eviction save' && git push --no-verify", + ], + { + cwd: agent.workdir, + stdout: 'pipe', + stderr: 'pipe', + } + ); + const exitCode = await proc.exited; + const stdout = await new Response(proc.stdout).text(); + const stderr = await new Response(proc.stderr).text(); + console.log( + `${DRAIN_LOG} Phase 4: agent ${agent.agentId} git save exited ${exitCode}` + + (stdout ? ` stdout=${stdout.trim()}` : '') + + (stderr ? ` stderr=${stderr.trim()}` : '') + ); + } catch (err) { + console.warn(`${DRAIN_LOG} Phase 4: force-save failed for agent ${agent.agentId}:`, err); + } + } + + console.log(`${DRAIN_LOG} Drain complete`); +} + export async function stopAll(): Promise { // Cancel all idle timers for (const [, timer] of idleTimers) { From f639f799ad55c9ce0665cac3573ca4c6e6ca8d19 Mon Sep 17 00:00:00 2001 From: Pedro Heyerdahl <61753986+pedroheyerdahl@users.noreply.github.com> Date: Tue, 31 Mar 2026 15:48:23 -0300 Subject: [PATCH 2/7] fix(tracking): only resolve signup product when callbackPath was explicitly provided (#1793) * fix(tracking): only resolve signup product when callbackPath was explicitly provided * style: apply oxfmt formatting to after-sign-in route --- src/app/users/after-sign-in/route.tsx | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/app/users/after-sign-in/route.tsx b/src/app/users/after-sign-in/route.tsx index a241a2912a..b898fb61be 100644 --- a/src/app/users/after-sign-in/route.tsx +++ b/src/app/users/after-sign-in/route.tsx @@ -49,7 +49,10 @@ export async function GET(request: NextRequest) { // callbackPath query param, so the value cannot be user-tampered. This // runs exactly once per signup because has_validation_stytch is set // after account verification completes. - const product = resolveSignupProduct(responsePath, !!url.searchParams.get('source')); + const product = resolveSignupProduct( + callbackPath && isValidCallbackPath(callbackPath) ? responsePath : null, + !!url.searchParams.get('source') + ); if (product) { PostHogClient().capture({ distinctId: user.google_user_email, From ee84451b7ee67657abf229236f84680df3e90c81 Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Tue, 31 Mar 2026 19:29:30 +0000 Subject: [PATCH 3/7] fix(drain): address PR review comments on drainAll() - Phase 1: wrap TownDO fetch in 10s abortable timeout so a hung endpoint cannot stall the entire drain sequence - Phase 2: clear idle timers before sending eviction nudge so a pre-existing timer cannot race and flip the agent to exited - Phase 4: abort SDK sessions before force-saving so git operations don't race with a still-running agent's worktree writes --- .../container/src/process-manager.ts | 56 ++++++++++++++++--- 1 file changed, 48 insertions(+), 8 deletions(-) diff --git a/cloudflare-gastown/container/src/process-manager.ts b/cloudflare-gastown/container/src/process-manager.ts index 23c796d6df..3217f0de87 100644 --- a/cloudflare-gastown/container/src/process-manager.ts +++ b/cloudflare-gastown/container/src/process-manager.ts @@ -999,14 +999,22 @@ export async function drainAll(): Promise { if (apiUrl && token && townId) { console.log(`${DRAIN_LOG} Phase 1: notifying TownDO of container eviction`); - const resp = await fetch(`${apiUrl}/api/towns/${townId}/container-eviction`, { - method: 'POST', - headers: { - 'Content-Type': 'application/json', - Authorization: `Bearer ${token}`, - }, - }); - console.log(`${DRAIN_LOG} Phase 1: TownDO responded ${resp.status}`); + const NOTIFY_TIMEOUT_MS = 10_000; + const abortCtrl = new AbortController(); + const timer = setTimeout(() => abortCtrl.abort(), NOTIFY_TIMEOUT_MS); + try { + const resp = await fetch(`${apiUrl}/api/towns/${townId}/container-eviction`, { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + Authorization: `Bearer ${token}`, + }, + signal: abortCtrl.signal, + }); + console.log(`${DRAIN_LOG} Phase 1: TownDO responded ${resp.status}`); + } finally { + clearTimeout(timer); + } } else { console.warn( `${DRAIN_LOG} Phase 1: skipping TownDO notification (missing apiUrl=${!!apiUrl} token=${!!token} townId=${!!townId})` @@ -1034,6 +1042,9 @@ export async function drainAll(): Promise { // Mayor and other roles: no nudge needed if (nudgeMessage) { + // Clear any existing idle timer so it doesn't fire mid-nudge and + // flip the agent to 'exited' before the eviction prompt completes. + clearIdleTimer(agent.agentId); console.log(`${DRAIN_LOG} Phase 2: nudging ${agent.role} agent ${agent.agentId}`); await sendMessage(agent.agentId, nudgeMessage); } @@ -1066,6 +1077,35 @@ export async function drainAll(): Promise { console.log(`${DRAIN_LOG} Phase 4: all agents finished, no force-save needed`); } + // Abort all straggler sessions first so they stop writing to the worktree. + // This prevents races with git (index.lock collisions, missed late writes, + // concurrent git commands from the agent). + for (const agent of stragglers) { + try { + console.log(`${DRAIN_LOG} Phase 4: aborting session for agent ${agent.agentId}`); + // Abort event subscription so handleIdleEvent won't fire + const evtCtrl = eventAbortControllers.get(agent.agentId); + if (evtCtrl) evtCtrl.abort(); + + // Abort the SDK session to stop the LLM turn / tool execution + const instance = sdkInstances.get(agent.workdir); + if (instance) { + await instance.client.session.abort({ path: { id: agent.sessionId } }); + } + agent.status = 'exited'; + agent.exitReason = 'container eviction'; + } catch (err) { + console.warn( + `${DRAIN_LOG} Phase 4: failed to abort session for agent ${agent.agentId}:`, + err + ); + // Mark exited anyway — best-effort freeze before snapshotting + agent.status = 'exited'; + agent.exitReason = 'container eviction (abort failed)'; + } + } + + // Now that all sessions are frozen, snapshot each worktree for (const agent of stragglers) { try { console.log(`${DRAIN_LOG} Phase 4: force-saving agent ${agent.agentId} in ${agent.workdir}`); From 481196ea80e46717854ca65d6735ac261261fae2 Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Tue, 31 Mar 2026 20:13:47 +0000 Subject: [PATCH 4/7] fix(container): clear idle timers and abort sessions before Phase 4 git save Phase 4 (force-save stragglers) now runs in two sub-steps: 1. Freeze: cancel idle timers, abort event subscriptions, abort SDK sessions, and mark agents as exited. This prevents the normal completion path (idle timer -> onExit -> bead completion) from racing with the WIP snapshot. 2. Snapshot: git add/commit/push each worktree after all sessions are frozen, avoiding .git/index.lock collisions. Also adds AbortSignal.timeout(10s) to Phase 1 TownDO notification and clearIdleTimer before Phase 2 nudge messages. --- .../container/src/process-manager.ts | 75 ++++++++++--------- 1 file changed, 39 insertions(+), 36 deletions(-) diff --git a/cloudflare-gastown/container/src/process-manager.ts b/cloudflare-gastown/container/src/process-manager.ts index 3217f0de87..6e33b7a04e 100644 --- a/cloudflare-gastown/container/src/process-manager.ts +++ b/cloudflare-gastown/container/src/process-manager.ts @@ -999,22 +999,15 @@ export async function drainAll(): Promise { if (apiUrl && token && townId) { console.log(`${DRAIN_LOG} Phase 1: notifying TownDO of container eviction`); - const NOTIFY_TIMEOUT_MS = 10_000; - const abortCtrl = new AbortController(); - const timer = setTimeout(() => abortCtrl.abort(), NOTIFY_TIMEOUT_MS); - try { - const resp = await fetch(`${apiUrl}/api/towns/${townId}/container-eviction`, { - method: 'POST', - headers: { - 'Content-Type': 'application/json', - Authorization: `Bearer ${token}`, - }, - signal: abortCtrl.signal, - }); - console.log(`${DRAIN_LOG} Phase 1: TownDO responded ${resp.status}`); - } finally { - clearTimeout(timer); - } + const resp = await fetch(`${apiUrl}/api/towns/${townId}/container-eviction`, { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + Authorization: `Bearer ${token}`, + }, + signal: AbortSignal.timeout(10_000), + }); + console.log(`${DRAIN_LOG} Phase 1: TownDO responded ${resp.status}`); } else { console.warn( `${DRAIN_LOG} Phase 1: skipping TownDO notification (missing apiUrl=${!!apiUrl} token=${!!token} townId=${!!townId})` @@ -1042,8 +1035,9 @@ export async function drainAll(): Promise { // Mayor and other roles: no nudge needed if (nudgeMessage) { - // Clear any existing idle timer so it doesn't fire mid-nudge and - // flip the agent to 'exited' before the eviction prompt completes. + // Cancel the idle timer before nudging — if the agent was + // already idle, the timer could fire mid-nudge and exit the + // agent before it processes the eviction message. clearIdleTimer(agent.agentId); console.log(`${DRAIN_LOG} Phase 2: nudging ${agent.role} agent ${agent.agentId}`); await sendMessage(agent.agentId, nudgeMessage); @@ -1070,42 +1064,51 @@ export async function drainAll(): Promise { } // ── Phase 4: Force-save remaining agents ──────────────────────────── + // Two sub-steps: first freeze all stragglers (cancel idle timers, + // abort event subscriptions and SDK sessions), then snapshot each + // worktree. Freezing first prevents the normal completion path + // (idle timer → onExit → bead completion) from racing with the WIP + // git save, and avoids .git/index.lock collisions with agent git ops. const stragglers = [...agents.values()].filter(a => a.status === 'running'); if (stragglers.length > 0) { - console.log(`${DRAIN_LOG} Phase 4: force-saving ${stragglers.length} straggler(s)`); + console.log(`${DRAIN_LOG} Phase 4: freezing ${stragglers.length} straggler(s)`); } else { console.log(`${DRAIN_LOG} Phase 4: all agents finished, no force-save needed`); } - // Abort all straggler sessions first so they stop writing to the worktree. - // This prevents races with git (index.lock collisions, missed late writes, - // concurrent git commands from the agent). + // 4a: Freeze — cancel idle timers and abort sessions so no + // completion/exit callbacks can fire during the git snapshot. for (const agent of stragglers) { try { - console.log(`${DRAIN_LOG} Phase 4: aborting session for agent ${agent.agentId}`); - // Abort event subscription so handleIdleEvent won't fire - const evtCtrl = eventAbortControllers.get(agent.agentId); - if (evtCtrl) evtCtrl.abort(); + // Cancel idle timer FIRST — prevents the timer from firing and + // marking the agent as completed via onExit() while we abort. + clearIdleTimer(agent.agentId); + + // Abort event subscription + const controller = eventAbortControllers.get(agent.agentId); + if (controller) { + controller.abort(); + eventAbortControllers.delete(agent.agentId); + } - // Abort the SDK session to stop the LLM turn / tool execution + // Abort the SDK session const instance = sdkInstances.get(agent.workdir); if (instance) { - await instance.client.session.abort({ path: { id: agent.sessionId } }); + await instance.client.session.abort({ + path: { id: agent.sessionId }, + }); } + agent.status = 'exited'; agent.exitReason = 'container eviction'; + console.log(`${DRAIN_LOG} Phase 4: froze agent ${agent.agentId}`); } catch (err) { - console.warn( - `${DRAIN_LOG} Phase 4: failed to abort session for agent ${agent.agentId}:`, - err - ); - // Mark exited anyway — best-effort freeze before snapshotting - agent.status = 'exited'; - agent.exitReason = 'container eviction (abort failed)'; + console.warn(`${DRAIN_LOG} Phase 4: failed to freeze agent ${agent.agentId}:`, err); } } - // Now that all sessions are frozen, snapshot each worktree + // 4b: Snapshot — git add/commit/push each worktree now that + // all sessions are frozen. for (const agent of stragglers) { try { console.log(`${DRAIN_LOG} Phase 4: force-saving agent ${agent.agentId} in ${agent.workdir}`); From 05b59f583dd380b8b40e1a3a9c30e3c92e37bb39 Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Tue, 31 Mar 2026 20:48:12 +0000 Subject: [PATCH 5/7] fix(container): skip WIP snapshot for agents whose freeze failed Build the 4b snapshot list from successfully frozen agents only. If session.abort() throws in 4a, the agent is excluded from snapshotting to avoid racing git against a still-active session. --- .../container/src/process-manager.ts | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/cloudflare-gastown/container/src/process-manager.ts b/cloudflare-gastown/container/src/process-manager.ts index 6e33b7a04e..ab48edb738 100644 --- a/cloudflare-gastown/container/src/process-manager.ts +++ b/cloudflare-gastown/container/src/process-manager.ts @@ -1078,6 +1078,8 @@ export async function drainAll(): Promise { // 4a: Freeze — cancel idle timers and abort sessions so no // completion/exit callbacks can fire during the git snapshot. + // Only agents that freeze successfully are safe to snapshot. + const frozen: typeof stragglers = []; for (const agent of stragglers) { try { // Cancel idle timer FIRST — prevents the timer from firing and @@ -1101,15 +1103,24 @@ export async function drainAll(): Promise { agent.status = 'exited'; agent.exitReason = 'container eviction'; + frozen.push(agent); console.log(`${DRAIN_LOG} Phase 4: froze agent ${agent.agentId}`); } catch (err) { - console.warn(`${DRAIN_LOG} Phase 4: failed to freeze agent ${agent.agentId}:`, err); + // Freeze failed — the session may still be writing to the + // worktree. Skip this agent in 4b to avoid .git/index.lock + // races and partial snapshots. + console.warn( + `${DRAIN_LOG} Phase 4: failed to freeze agent ${agent.agentId}, skipping snapshot:`, + err + ); } } // 4b: Snapshot — git add/commit/push each worktree now that - // all sessions are frozen. - for (const agent of stragglers) { + // all sessions are frozen. Only iterate agents that froze + // successfully; unfrozen agents are skipped to avoid racing + // with a still-active SDK session. + for (const agent of frozen) { try { console.log(`${DRAIN_LOG} Phase 4: force-saving agent ${agent.agentId} in ${agent.workdir}`); const proc = Bun.spawn( From 6450395e42b242c959d2314f3a9d9f6efc60b1c8 Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Tue, 31 Mar 2026 21:32:54 +0000 Subject: [PATCH 6/7] fix(container): set upstream on eviction push for first-time branches Use git push --set-upstream origin HEAD so branches without a configured upstream (e.g. freshly created agent worktrees) can push the WIP eviction snapshot instead of failing silently. --- cloudflare-gastown/container/src/process-manager.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloudflare-gastown/container/src/process-manager.ts b/cloudflare-gastown/container/src/process-manager.ts index ab48edb738..cecd137e67 100644 --- a/cloudflare-gastown/container/src/process-manager.ts +++ b/cloudflare-gastown/container/src/process-manager.ts @@ -1127,7 +1127,7 @@ export async function drainAll(): Promise { [ 'bash', '-c', - "git add -A && git commit --allow-empty -m 'WIP: container eviction save' && git push --no-verify", + "git add -A && git commit --allow-empty -m 'WIP: container eviction save' && git push --set-upstream origin HEAD --no-verify", ], { cwd: agent.workdir, From 00b8599cb25b48ee3aa2fce9d506c2711d5b37cc Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Wed, 1 Apr 2026 00:42:03 +0000 Subject: [PATCH 7/7] fix(container): skip git push for lightweight workspaces with no origin remote Phase 4b of drainAll() assumed all workspaces have an 'origin' remote. Lightweight workspaces (mayor/triage) are initialized with 'git init' and never add a remote, causing 'fatal: origin does not appear to be a git repository' during eviction. Check for the origin remote via 'git remote get-url origin' before attempting to push. When no remote exists, commit locally only and log a warning. --- .../container/src/process-manager.ts | 39 +++++++++++++------ 1 file changed, 27 insertions(+), 12 deletions(-) diff --git a/cloudflare-gastown/container/src/process-manager.ts b/cloudflare-gastown/container/src/process-manager.ts index cecd137e67..eca69e6316 100644 --- a/cloudflare-gastown/container/src/process-manager.ts +++ b/cloudflare-gastown/container/src/process-manager.ts @@ -1123,18 +1123,33 @@ export async function drainAll(): Promise { for (const agent of frozen) { try { console.log(`${DRAIN_LOG} Phase 4: force-saving agent ${agent.agentId} in ${agent.workdir}`); - const proc = Bun.spawn( - [ - 'bash', - '-c', - "git add -A && git commit --allow-empty -m 'WIP: container eviction save' && git push --set-upstream origin HEAD --no-verify", - ], - { - cwd: agent.workdir, - stdout: 'pipe', - stderr: 'pipe', - } - ); + + // Check whether a remote named "origin" exists. Lightweight + // workspaces (mayor/triage) are created with `git init` and + // never add a remote, so pushing would fail with + // "fatal: 'origin' does not appear to be a git repository". + const remoteCheck = Bun.spawn(['git', 'remote', 'get-url', 'origin'], { + cwd: agent.workdir, + stdout: 'pipe', + stderr: 'pipe', + }); + const hasOrigin = (await remoteCheck.exited) === 0; + + const gitCmd = hasOrigin + ? "git add -A && git commit --allow-empty -m 'WIP: container eviction save' && git push --set-upstream origin HEAD --no-verify" + : "git add -A && git commit --allow-empty -m 'WIP: container eviction save'"; + + if (!hasOrigin) { + console.warn( + `${DRAIN_LOG} Phase 4: no origin remote for agent ${agent.agentId}, committing locally only (push skipped)` + ); + } + + const proc = Bun.spawn(['bash', '-c', gitCmd], { + cwd: agent.workdir, + stdout: 'pipe', + stderr: 'pipe', + }); const exitCode = await proc.exited; const stdout = await new Response(proc.stdout).text(); const stderr = await new Response(proc.stderr).text();