diff --git a/services/gastown/container/src/main.ts b/services/gastown/container/src/main.ts
index 5b12a1aa1..e351a9173 100644
--- a/services/gastown/container/src/main.ts
+++ b/services/gastown/container/src/main.ts
@@ -1,11 +1,54 @@
 import { startControlServer } from './control-server';
 import { log } from './logger';
-import { bootHydration, getUptime } from './process-manager';
+import { activeAgentCount, bootHydration, getUptime, listAgents } from './process-manager';
 
-log.info('container.cold_start', { uptime: getUptime(), ts: new Date().toISOString() });
+// Container-scoped identifiers for crash/diagnostic logs. The container is
+// pinned to a single town for its lifetime (see GASTOWN_TOWN_ID injection in
+// the deployer), so reading these once at module init is safe and lets us
+// emit them even when no agents are registered yet.
+const TOWN_ID = process.env.GASTOWN_TOWN_ID ?? null;
+
+log.info('container.cold_start', {
+  uptime: getUptime(),
+  ts: new Date().toISOString(),
+  townId: TOWN_ID,
+});
+
+// Bun (like Node) will ignore unhandled promise rejections unless a handler
+// is registered. Without this handler a rejection in a fire-and-forget path
+// (e.g. `void saveDbSnapshot(...)`, `void subscribeToEvents(...)`,
+// `setInterval(() => void fn())`) is effectively invisible — making the
+// root cause of container crashes impossible to diagnose from logs.
+//
+// We deliberately DO NOT call process.exit here: visibility is the goal.
+// If a specific rejection turns out to be fatal state corruption we can
+// escalate it individually.
+process.on('unhandledRejection', (reason, promise) => {
+  const err =
+    reason instanceof Error
+      ? { message: reason.message, stack: reason.stack, name: reason.name }
+      : { message: String(reason) };
+  log.error('container.unhandled_rejection', {
+    ...err,
+    townId: TOWN_ID,
+    uptimeMs: getUptime(),
+    activeAgents: activeAgentCount(),
+    promise: String(promise),
+  });
+});
 
 process.on('uncaughtException', err => {
-  log.error('container.uncaught_exception', { error: err.message, stack: err.stack });
+  log.error('container.uncaught_exception', {
+    message: err.message,
+    stack: err.stack,
+    name: err.name,
+    townId: TOWN_ID,
+    uptimeMs: getUptime(),
+    activeAgents: activeAgentCount(),
+  });
+  // Keep the existing fatal behaviour for truly uncaught synchronous errors.
+  // An unhandled rejection is handled separately above without exit so we
+  // can observe the crash class before deciding whether to remain fatal.
   process.exit(1);
 });
 
@@ -13,8 +56,45 @@ process.on('SIGTERM', () => {
   console.log('SIGTERM received — starting graceful drain...');
 });
 
+// Periodically log RSS memory so we can correlate OOM-class failures
+// (external SIGKILL from Cloudflare Containers runtime when a memory
+// ceiling is hit) with steady-state memory growth. 30s cadence matches
+// the heartbeat interval and is cheap.
+const MEMORY_LOG_INTERVAL_MS = 30_000;
+setInterval(() => {
+  try {
+    const mem = process.memoryUsage();
+    log.info('container.memory_usage', {
+      rssMB: Math.round(mem.rss / 1024 / 1024),
+      heapUsedMB: Math.round(mem.heapUsed / 1024 / 1024),
+      heapTotalMB: Math.round(mem.heapTotal / 1024 / 1024),
+      externalMB: Math.round(mem.external / 1024 / 1024),
+      townId: TOWN_ID,
+      uptimeMs: getUptime(),
+      agents: listAgents().length,
+      activeAgents: activeAgentCount(),
+    });
+  } catch (err) {
+    log.warn('container.memory_usage_failed', {
+      error: err instanceof Error ? err.message : String(err),
+    });
+  }
+}, MEMORY_LOG_INTERVAL_MS);
+
 startControlServer();
 
 void (async () => {
-  await bootHydration();
+  try {
+    await bootHydration();
+  } catch (err) {
+    // bootHydration has its own try/catch for the registry fetch path but
+    // the inner startAgent loop can still throw on rare synchronous errors
+    // before its first await. Log rather than crash so the next /agents/start
+    // request can recover.
+    log.error('container.boot_hydration_failed', {
+      message: err instanceof Error ? err.message : String(err),
+      stack: err instanceof Error ? err.stack : undefined,
+      townId: TOWN_ID,
+    });
+  }
 })();
diff --git a/services/gastown/container/src/process-manager.test.ts b/services/gastown/container/src/process-manager.test.ts
index 91ffdcce4..d2c8906c2 100644
--- a/services/gastown/container/src/process-manager.test.ts
+++ b/services/gastown/container/src/process-manager.test.ts
@@ -24,7 +24,7 @@ vi.mock('./token-refresh', () => ({
   refreshTokenIfNearExpiry: vi.fn(),
 }));
 
-const { applyModelToSession } = await import('./process-manager');
+const { applyModelToSession, withStartAgentLock } = await import('./process-manager');
 
 type PromptCall = {
   path: { id: string };
@@ -117,3 +117,64 @@ describe('applyModelToSession', () => {
     ).rejects.toThrow('simulated SDK failure');
   });
 });
+
+describe('withStartAgentLock', () => {
+  it('serialises concurrent callers for the same agentId', async () => {
+    const order: string[] = [];
+    let secondStartedBeforeFirstFinished = false;
+
+    // Fire both in the same microtask so they race on the lock.
+    const first = withStartAgentLock('agent-1', async () => {
+      order.push('first:start');
+      await new Promise(r => setTimeout(r, 20));
+      order.push('first:end');
+      return 1;
+    });
+    const second = withStartAgentLock('agent-1', async () => {
+      // If the lock works, `first:end` has already been pushed.
+      if (!order.includes('first:end')) {
+        secondStartedBeforeFirstFinished = true;
+      }
+      order.push('second:start');
+      order.push('second:end');
+      return 2;
+    });
+
+    const [r1, r2] = await Promise.all([first, second]);
+    expect(r1).toBe(1);
+    expect(r2).toBe(2);
+    expect(secondStartedBeforeFirstFinished).toBe(false);
+    expect(order).toEqual(['first:start', 'first:end', 'second:start', 'second:end']);
+  });
+
+  it('runs concurrently for different agentIds', async () => {
+    const order: string[] = [];
+
+    const a = withStartAgentLock('agent-a', async () => {
+      order.push('a:start');
+      await new Promise(r => setTimeout(r, 20));
+      order.push('a:end');
+    });
+    const b = withStartAgentLock('agent-b', async () => {
+      order.push('b:start');
+      await new Promise(r => setTimeout(r, 20));
+      order.push('b:end');
+    });
+
+    await Promise.all([a, b]);
+
+    // Both should have started before either ended (no serialisation across ids).
+    expect(order.indexOf('b:start')).toBeLessThan(order.indexOf('a:end'));
+  });
+
+  it('releases the lock when the fn throws so subsequent callers can proceed', async () => {
+    await expect(
+      withStartAgentLock('agent-err', async () => {
+        throw new Error('boom');
+      })
+    ).rejects.toThrow('boom');
+
+    const result = await withStartAgentLock('agent-err', async () => 'ok');
+    expect(result).toBe('ok');
+  });
+});
diff --git a/services/gastown/container/src/process-manager.ts b/services/gastown/container/src/process-manager.ts
index 33a37fa29..02eb63a0f 100644
--- a/services/gastown/container/src/process-manager.ts
+++ b/services/gastown/container/src/process-manager.ts
@@ -71,6 +71,47 @@ export function isDraining(): boolean {
 // once created, the SDK instance is reused without locking.
 let sdkServerLock: Promise<void> = Promise.resolve();
 
+// Per-agentId mutex for startAgent. Without this, two concurrent POST
+// /agents/start calls for the same agentId (observed in production: two
+// `[control-server] /agents/start:` log lines at the same millisecond)
+// both pass the re-entrancy check at the top of startAgent before either
+// has committed a 'starting' record. The second invocation aborts the
+// first's startupAbortController and both paths race on session creation,
+// idle timers, and SDK instance reference counts — leaving the agent in
+// an inconsistent state (orphaned sessions, leaked sessionCount, etc).
+//
+// Serialising per agentId means the second caller waits for the first to
+// complete (or abort) before proceeding, and then observes a consistent
+// snapshot in `agents.get(agentId)`.
+const startAgentLocks = new Map<string, Promise<void>>();
+
+// Exported for tests that exercise the locking behaviour directly without
+// bringing up the whole SDK/process harness. Production callers should use
+// `startAgent` (which wraps `startAgentImpl` with this lock).
+export async function withStartAgentLock<T>(agentId: string, fn: () => Promise<T>): Promise<T> {
+  const previous = startAgentLocks.get(agentId) ?? Promise.resolve();
+  // Use the same explicit `new Promise` pattern as `sdkServerLock` above
+  // instead of `Promise.withResolvers`, which is not available on older
+  // Bun runtimes. This module is imported during container startup, so a
+  // missing global here would throw before the crash handlers are
+  // registered and prevent the control server from starting.
+  let releaseLock!: () => void;
+  const lockPromise = new Promise<void>(resolve => {
+    releaseLock = resolve;
+  });
+  startAgentLocks.set(agentId, lockPromise);
+  try {
+    await previous.catch(() => {});
+    return await fn();
+  } finally {
+    releaseLock();
+    // Only clear the slot if no newer caller has queued behind us.
+    if (startAgentLocks.get(agentId) === lockPromise) {
+      startAgentLocks.delete(agentId);
+    }
+  }
+}
+
 export function getUptime(): number {
   return Date.now() - startTime;
 }
@@ -1005,11 +1046,22 @@ async function subscribeToEvents(
 /**
  * Start an agent: ensure SDK server, create session, subscribe to events,
  * send initial prompt.
+ *
+ * Serialises concurrent callers for the same agentId so the re-entrancy
+ * handling inside `startAgentImpl` observes a consistent snapshot.
  */
 export async function startAgent(
   request: StartAgentRequest,
   workdir: string,
   env: Record<string, string>
+): Promise<void> {
+  return withStartAgentLock(request.agentId, () => startAgentImpl(request, workdir, env));
+}
+
+async function startAgentImpl(
+  request: StartAgentRequest,
+  workdir: string,
+  env: Record<string, string>
 ): Promise<void> {
   const existing = agents.get(request.agentId);
   if (existing && (existing.status === 'running' || existing.status === 'starting')) {
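
Reviewer note, not part of the patch: a minimal sketch of the per-key ordering guarantee
withStartAgentLock gives startAgent, written only against the exports added above. The file
name is made up, and it assumes process-manager.ts can be imported outside the container
harness (the test file does this with its dependencies mocked via vi.mock).

// lock-sketch.ts (illustration only)
import { withStartAgentLock } from './process-manager';

const order: string[] = [];

void (async () => {
  await Promise.all([
    // Same key: the second callback does not begin until the first settles,
    // which is why two simultaneous POST /agents/start calls no longer race.
    withStartAgentLock('agent-1', async () => {
      order.push('1:a-start');
      await new Promise(r => setTimeout(r, 10));
      order.push('1:a-end');
    }),
    withStartAgentLock('agent-1', async () => {
      order.push('1:b-start'); // always after '1:a-end'
    }),
    // Different key: free to interleave with the agent-1 work.
    withStartAgentLock('agent-2', async () => {
      order.push('2:start');
    }),
  ]);
  // agent-1 entries are strictly ordered; '2:start' may appear anywhere.
  console.log(order);
})();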