From 8d7578a87f73c9749a6c86c07e7350bd12db94b7 Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Tue, 5 May 2026 20:05:38 +0000 Subject: [PATCH 1/3] feat(gastown-container): add crash visibility + per-agent start mutex MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Diagnostic changes to investigate frequent container restarts for town 4d82f099-ccb7-4eaf-8676-73562e0a27eb (~1.5–2 min boot-hydration loops). - main.ts: add unhandledRejection listener that logs full error/stack without exiting (without a handler Bun only reports rejections as unstructured console output, and Node's default since v15 is to terminate the process outright, so fire-and-forget failures like void saveDbSnapshot()/void subscribeToEvents() never reach our structured logs). Include uptime and active-agent count for correlation. - main.ts: improve uncaughtException log with name/uptime/agent count. - main.ts: 30s periodic container.memory_usage log (rss/heap/external) so OOM-class failures (external SIGKILL from Cloudflare Containers runtime when the memory ceiling is hit) become observable — these leave no exception behind. - main.ts: wrap bootHydration() in try/catch so a rare synchronous throw before the first await doesn't crash the process. - process-manager.ts: add per-agentId mutex for startAgent. Production logs show two /agents/start requests for the same agentId logged at the same millisecond; both pass the re-entrancy check before either commits a 'starting' record, then race on startupAbortController, session creation, idle timers, and SDK sessionCount. Serialising per agentId makes the re-entrant path observe a consistent snapshot. - process-manager.test.ts: three tests for the mutex — same-id serialisation, different-id concurrency, lock release on throw. 
--- services/gastown/container/src/main.ts | 72 ++++++++++++++++++- .../container/src/process-manager.test.ts | 63 +++++++++++++++- .../gastown/container/src/process-manager.ts | 44 ++++++++++++ 3 files changed, 175 insertions(+), 4 deletions(-) diff --git a/services/gastown/container/src/main.ts b/services/gastown/container/src/main.ts index 5b12a1aa19..4b16f64304 100644 --- a/services/gastown/container/src/main.ts +++ b/services/gastown/container/src/main.ts @@ -1,11 +1,42 @@ import { startControlServer } from './control-server'; import { log } from './logger'; -import { bootHydration, getUptime } from './process-manager'; +import { activeAgentCount, bootHydration, getUptime, listAgents } from './process-manager'; log.info('container.cold_start', { uptime: getUptime(), ts: new Date().toISOString() }); +// Bun (like Node) will ignore unhandled promise rejections unless a handler +// is registered. Without this handler a rejection in a fire-and-forget path +// (e.g. `void saveDbSnapshot(...)`, `void subscribeToEvents(...)`, +// `setInterval(() => void fn())`) is effectively invisible — making the +// root cause of container crashes impossible to diagnose from logs. +// +// We deliberately DO NOT call process.exit here: visibility is the goal. +// If a specific rejection turns out to be fatal state corruption we can +// escalate it individually. +process.on('unhandledRejection', (reason, promise) => { + const err = + reason instanceof Error + ? 
{ message: reason.message, stack: reason.stack, name: reason.name } + : { message: String(reason) }; + log.error('container.unhandled_rejection', { + ...err, + uptimeMs: getUptime(), + activeAgents: activeAgentCount(), + promise: String(promise), + }); +}); + process.on('uncaughtException', err => { - log.error('container.uncaught_exception', { error: err.message, stack: err.stack }); + log.error('container.uncaught_exception', { + message: err.message, + stack: err.stack, + name: err.name, + uptimeMs: getUptime(), + activeAgents: activeAgentCount(), + }); + // Keep the existing fatal behaviour for truly uncaught synchronous errors. + // An unhandled rejection is handled separately above without exit so we + // can observe the crash class before deciding whether to remain fatal. process.exit(1); }); @@ -13,8 +44,43 @@ process.on('SIGTERM', () => { console.log('SIGTERM received — starting graceful drain...'); }); +// Periodically log RSS memory so we can correlate OOM-class failures +// (external SIGKILL from Cloudflare Containers runtime when a memory +// ceiling is hit) with steady-state memory growth. 30s cadence matches +// the heartbeat interval and is cheap. +const MEMORY_LOG_INTERVAL_MS = 30_000; +setInterval(() => { + try { + const mem = process.memoryUsage(); + log.info('container.memory_usage', { + rssMB: Math.round(mem.rss / 1024 / 1024), + heapUsedMB: Math.round(mem.heapUsed / 1024 / 1024), + heapTotalMB: Math.round(mem.heapTotal / 1024 / 1024), + externalMB: Math.round(mem.external / 1024 / 1024), + uptimeMs: getUptime(), + agents: listAgents().length, + activeAgents: activeAgentCount(), + }); + } catch (err) { + log.warn('container.memory_usage_failed', { + error: err instanceof Error ? 
err.message : String(err), + }); + } +}, MEMORY_LOG_INTERVAL_MS); + startControlServer(); void (async () => { - await bootHydration(); + try { + await bootHydration(); + } catch (err) { + // bootHydration has its own try/catch for the registry fetch path but + // the inner startAgent loop can still throw on rare synchronous errors + // before its first await. Log rather than crash so the next /agents/start + // request can recover. + log.error('container.boot_hydration_failed', { + message: err instanceof Error ? err.message : String(err), + stack: err instanceof Error ? err.stack : undefined, + }); + } })(); diff --git a/services/gastown/container/src/process-manager.test.ts b/services/gastown/container/src/process-manager.test.ts index 91ffdcce4c..d2c8906c2a 100644 --- a/services/gastown/container/src/process-manager.test.ts +++ b/services/gastown/container/src/process-manager.test.ts @@ -24,7 +24,7 @@ vi.mock('./token-refresh', () => ({ refreshTokenIfNearExpiry: vi.fn(), })); -const { applyModelToSession } = await import('./process-manager'); +const { applyModelToSession, withStartAgentLock } = await import('./process-manager'); type PromptCall = { path: { id: string }; @@ -117,3 +117,64 @@ describe('applyModelToSession', () => { ).rejects.toThrow('simulated SDK failure'); }); }); + +describe('withStartAgentLock', () => { + it('serialises concurrent callers for the same agentId', async () => { + const order: string[] = []; + let secondStartedBeforeFirstFinished = false; + + // Fire both in the same microtask so they race on the lock. + const first = withStartAgentLock('agent-1', async () => { + order.push('first:start'); + await new Promise(r => setTimeout(r, 20)); + order.push('first:end'); + return 1; + }); + const second = withStartAgentLock('agent-1', async () => { + // If the lock works, `first:end` has already been pushed. 
+ if (!order.includes('first:end')) { + secondStartedBeforeFirstFinished = true; + } + order.push('second:start'); + order.push('second:end'); + return 2; + }); + + const [r1, r2] = await Promise.all([first, second]); + expect(r1).toBe(1); + expect(r2).toBe(2); + expect(secondStartedBeforeFirstFinished).toBe(false); + expect(order).toEqual(['first:start', 'first:end', 'second:start', 'second:end']); + }); + + it('runs concurrently for different agentIds', async () => { + const order: string[] = []; + + const a = withStartAgentLock('agent-a', async () => { + order.push('a:start'); + await new Promise(r => setTimeout(r, 20)); + order.push('a:end'); + }); + const b = withStartAgentLock('agent-b', async () => { + order.push('b:start'); + await new Promise(r => setTimeout(r, 20)); + order.push('b:end'); + }); + + await Promise.all([a, b]); + + // Both should have started before either ended (no serialisation across ids). + expect(order.indexOf('b:start')).toBeLessThan(order.indexOf('a:end')); + }); + + it('releases the lock when the fn throws so subsequent callers can proceed', async () => { + await expect( + withStartAgentLock('agent-err', async () => { + throw new Error('boom'); + }) + ).rejects.toThrow('boom'); + + const result = await withStartAgentLock('agent-err', async () => 'ok'); + expect(result).toBe('ok'); + }); +}); diff --git a/services/gastown/container/src/process-manager.ts b/services/gastown/container/src/process-manager.ts index 33a37fa29a..9d179edb33 100644 --- a/services/gastown/container/src/process-manager.ts +++ b/services/gastown/container/src/process-manager.ts @@ -71,6 +71,39 @@ export function isDraining(): boolean { // once created, the SDK instance is reused without locking. let sdkServerLock: Promise = Promise.resolve(); +// Per-agentId mutex for startAgent. 
Without this, two concurrent POST +// /agents/start calls for the same agentId (observed in production: two +// `[control-server] /agents/start:` log lines at the same millisecond) +// both pass the re-entrancy check at the top of startAgent before either +// has committed a 'starting' record. The second invocation aborts the +// first's startupAbortController and both paths race on session creation, +// idle timers, and SDK instance reference counts — leaving the agent in +// an inconsistent state (orphaned sessions, leaked sessionCount, etc). +// +// Serialising per agentId means the second caller waits for the first to +// complete (or abort) before proceeding, and then observes a consistent +// snapshot in `agents.get(agentId)`. +const startAgentLocks = new Map>(); + +// Exported for tests that exercise the locking behaviour directly without +// bringing up the whole SDK/process harness. Production callers should use +// `startAgent` (which wraps `startAgentImpl` with this lock). +export async function withStartAgentLock(agentId: string, fn: () => Promise): Promise { + const previous = startAgentLocks.get(agentId) ?? Promise.resolve(); + const deferred = Promise.withResolvers(); + startAgentLocks.set(agentId, deferred.promise); + try { + await previous.catch(() => {}); + return await fn(); + } finally { + deferred.resolve(); + // Only clear the slot if no newer caller has queued behind us. + if (startAgentLocks.get(agentId) === deferred.promise) { + startAgentLocks.delete(agentId); + } + } +} + export function getUptime(): number { return Date.now() - startTime; } @@ -1005,11 +1038,22 @@ async function subscribeToEvents( /** * Start an agent: ensure SDK server, create session, subscribe to events, * send initial prompt. + * + * Serialises concurrent callers for the same agentId so the re-entrancy + * handling inside `startAgentImpl` observes a consistent snapshot. 
*/ export async function startAgent( request: StartAgentRequest, workdir: string, env: Record +): Promise { + return withStartAgentLock(request.agentId, () => startAgentImpl(request, workdir, env)); +} + +async function startAgentImpl( + request: StartAgentRequest, + workdir: string, + env: Record ): Promise { const existing = agents.get(request.agentId); if (existing && (existing.status === 'running' || existing.status === 'starting')) { From ff34e19c508e9fc8db3e44a60846b56b0ce0a61a Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Tue, 5 May 2026 20:10:17 +0000 Subject: [PATCH 2/3] fix(container): replace Promise.withResolvers with explicit new Promise Promise.withResolvers is a newer API not available on older Bun runtimes. Since process-manager.ts is imported during container startup, a missing global would throw before crash handlers are registered and prevent the control server from starting. Use the same explicit new Promise pattern as the existing sdkServerLock. --- .../gastown/container/src/process-manager.ts | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/services/gastown/container/src/process-manager.ts b/services/gastown/container/src/process-manager.ts index 9d179edb33..02eb63a0f7 100644 --- a/services/gastown/container/src/process-manager.ts +++ b/services/gastown/container/src/process-manager.ts @@ -90,15 +90,23 @@ const startAgentLocks = new Map>(); // `startAgent` (which wraps `startAgentImpl` with this lock). export async function withStartAgentLock(agentId: string, fn: () => Promise): Promise { const previous = startAgentLocks.get(agentId) ?? Promise.resolve(); - const deferred = Promise.withResolvers(); - startAgentLocks.set(agentId, deferred.promise); + // Use the same explicit `new Promise` pattern as `sdkServerLock` above + // instead of `Promise.withResolvers`, which is not available on older + // Bun runtimes. 
This module is imported during container startup, so a + // missing global here would throw before the crash handlers are + // registered and prevent the control server from starting. + let releaseLock!: () => void; + const lockPromise = new Promise(resolve => { + releaseLock = resolve; + }); + startAgentLocks.set(agentId, lockPromise); try { await previous.catch(() => {}); return await fn(); } finally { - deferred.resolve(); + releaseLock(); // Only clear the slot if no newer caller has queued behind us. - if (startAgentLocks.get(agentId) === deferred.promise) { + if (startAgentLocks.get(agentId) === lockPromise) { startAgentLocks.delete(agentId); } } From cd2f2a99d2ea69af547c7f1afeb9cfb50976f75e Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Tue, 5 May 2026 20:15:54 +0000 Subject: [PATCH 3/3] feat(gastown/container): include townId in crash and memory logs Per review feedback, attach the container's GASTOWN_TOWN_ID to unhandled_rejection, uncaught_exception, cold_start, memory_usage, and boot_hydration_failed log entries so production crash logs can be correlated with a specific town without needing to also have an agent registered. --- services/gastown/container/src/main.ts | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/services/gastown/container/src/main.ts b/services/gastown/container/src/main.ts index 4b16f64304..e351a91731 100644 --- a/services/gastown/container/src/main.ts +++ b/services/gastown/container/src/main.ts @@ -2,7 +2,17 @@ import { startControlServer } from './control-server'; import { log } from './logger'; import { activeAgentCount, bootHydration, getUptime, listAgents } from './process-manager'; -log.info('container.cold_start', { uptime: getUptime(), ts: new Date().toISOString() }); +// Container-scoped identifiers for crash/diagnostic logs. 
The container is +// pinned to a single town for its lifetime (see GASTOWN_TOWN_ID injection in +// the deployer), so reading these once at module init is safe and lets us +// emit them even when no agents are registered yet. +const TOWN_ID = process.env.GASTOWN_TOWN_ID ?? null; + +log.info('container.cold_start', { + uptime: getUptime(), + ts: new Date().toISOString(), + townId: TOWN_ID, +}); // Bun (like Node) will ignore unhandled promise rejections unless a handler // is registered. Without this handler a rejection in a fire-and-forget path @@ -20,6 +30,7 @@ process.on('unhandledRejection', (reason, promise) => { : { message: String(reason) }; log.error('container.unhandled_rejection', { ...err, + townId: TOWN_ID, uptimeMs: getUptime(), activeAgents: activeAgentCount(), promise: String(promise), @@ -31,6 +42,7 @@ process.on('uncaughtException', err => { message: err.message, stack: err.stack, name: err.name, + townId: TOWN_ID, uptimeMs: getUptime(), activeAgents: activeAgentCount(), }); @@ -57,6 +69,7 @@ setInterval(() => { heapUsedMB: Math.round(mem.heapUsed / 1024 / 1024), heapTotalMB: Math.round(mem.heapTotal / 1024 / 1024), externalMB: Math.round(mem.external / 1024 / 1024), + townId: TOWN_ID, uptimeMs: getUptime(), agents: listAgents().length, activeAgents: activeAgentCount(), @@ -81,6 +94,7 @@ void (async () => { log.error('container.boot_hydration_failed', { message: err instanceof Error ? err.message : String(err), stack: err instanceof Error ? err.stack : undefined, + townId: TOWN_ID, }); } })();