diff --git a/services/gastown/container/src/main.ts b/services/gastown/container/src/main.ts
index 5b12a1aa1..e351a9173 100644
--- a/services/gastown/container/src/main.ts
+++ b/services/gastown/container/src/main.ts
@@ -1,11 +1,54 @@
 import { startControlServer } from './control-server';
 import { log } from './logger';
-import { bootHydration, getUptime } from './process-manager';
+import { activeAgentCount, bootHydration, getUptime, listAgents } from './process-manager';
 
-log.info('container.cold_start', { uptime: getUptime(), ts: new Date().toISOString() });
+// Container-scoped identifiers for crash/diagnostic logs. The container is
+// pinned to a single town for its lifetime (see GASTOWN_TOWN_ID injection in
+// the deployer), so reading these once at module init is safe and lets us
+// emit them even when no agents are registered yet.
+const TOWN_ID = process.env.GASTOWN_TOWN_ID ?? null;
+
+log.info('container.cold_start', {
+  uptime: getUptime(),
+  ts: new Date().toISOString(),
+  townId: TOWN_ID,
+});
+
+// Bun (like Node) will ignore unhandled promise rejections unless a handler
+// is registered. Without this handler a rejection in a fire-and-forget path
+// (e.g. `void saveDbSnapshot(...)`, `void subscribeToEvents(...)`,
+// `setInterval(() => void fn())`) is effectively invisible — making the
+// root cause of container crashes impossible to diagnose from logs.
+//
+// We deliberately DO NOT call process.exit here: visibility is the goal.
+// If a specific rejection turns out to be fatal state corruption we can
+// escalate it individually.
+process.on('unhandledRejection', (reason, promise) => {
+  const err =
+    reason instanceof Error
+      ? { message: reason.message, stack: reason.stack, name: reason.name }
+      : { message: String(reason) };
+  log.error('container.unhandled_rejection', {
+    ...err,
+    townId: TOWN_ID,
+    uptimeMs: getUptime(),
+    activeAgents: activeAgentCount(),
+    promise: String(promise),
+  });
+});
 
 process.on('uncaughtException', err => {
-  log.error('container.uncaught_exception', { error: err.message, stack: err.stack });
+  log.error('container.uncaught_exception', {
+    message: err.message,
+    stack: err.stack,
+    name: err.name,
+    townId: TOWN_ID,
+    uptimeMs: getUptime(),
+    activeAgents: activeAgentCount(),
+  });
+  // Keep the existing fatal behaviour for truly uncaught synchronous errors.
+  // An unhandled rejection is handled separately above without exit so we
+  // can observe the crash class before deciding whether to remain fatal.
   process.exit(1);
 });
 
@@ -13,8 +56,45 @@ process.on('SIGTERM', () => {
   console.log('SIGTERM received — starting graceful drain...');
 });
 
+// Periodically log RSS memory so we can correlate OOM-class failures
+// (external SIGKILL from Cloudflare Containers runtime when a memory
+// ceiling is hit) with steady-state memory growth. 30s cadence matches
+// the heartbeat interval and is cheap.
+const MEMORY_LOG_INTERVAL_MS = 30_000;
+setInterval(() => {
+  try {
+    const mem = process.memoryUsage();
+    log.info('container.memory_usage', {
+      rssMB: Math.round(mem.rss / 1024 / 1024),
+      heapUsedMB: Math.round(mem.heapUsed / 1024 / 1024),
+      heapTotalMB: Math.round(mem.heapTotal / 1024 / 1024),
+      externalMB: Math.round(mem.external / 1024 / 1024),
+      townId: TOWN_ID,
+      uptimeMs: getUptime(),
+      agents: listAgents().length,
+      activeAgents: activeAgentCount(),
+    });
+  } catch (err) {
+    log.warn('container.memory_usage_failed', {
+      error: err instanceof Error ? err.message : String(err),
+    });
+  }
+}, MEMORY_LOG_INTERVAL_MS);
+
 startControlServer();
 
 void (async () => {
-  await bootHydration();
+  try {
+    await bootHydration();
+  } catch (err) {
+    // bootHydration has its own try/catch for the registry fetch path but
+    // the inner startAgent loop can still throw on rare synchronous errors
+    // before its first await. Log rather than crash so the next /agents/start
+    // request can recover.
+    log.error('container.boot_hydration_failed', {
+      message: err instanceof Error ? err.message : String(err),
+      stack: err instanceof Error ? err.stack : undefined,
+      townId: TOWN_ID,
+    });
+  }
 })();
diff --git a/services/gastown/container/src/process-manager.test.ts b/services/gastown/container/src/process-manager.test.ts
index 91ffdcce4..d2c8906c2 100644
--- a/services/gastown/container/src/process-manager.test.ts
+++ b/services/gastown/container/src/process-manager.test.ts
@@ -24,7 +24,7 @@ vi.mock('./token-refresh', () => ({
   refreshTokenIfNearExpiry: vi.fn(),
 }));
 
-const { applyModelToSession } = await import('./process-manager');
+const { applyModelToSession, withStartAgentLock } = await import('./process-manager');
 
 type PromptCall = {
   path: { id: string };
@@ -117,3 +117,64 @@ describe('applyModelToSession', () => {
     ).rejects.toThrow('simulated SDK failure');
   });
 });
+
+describe('withStartAgentLock', () => {
+  it('serialises concurrent callers for the same agentId', async () => {
+    const order: string[] = [];
+    let secondStartedBeforeFirstFinished = false;
+
+    // Fire both in the same microtask so they race on the lock.
+    const first = withStartAgentLock('agent-1', async () => {
+      order.push('first:start');
+      await new Promise(r => setTimeout(r, 20));
+      order.push('first:end');
+      return 1;
+    });
+    const second = withStartAgentLock('agent-1', async () => {
+      // If the lock works, `first:end` has already been pushed.
+      if (!order.includes('first:end')) {
+        secondStartedBeforeFirstFinished = true;
+      }
+      order.push('second:start');
+      order.push('second:end');
+      return 2;
+    });
+
+    const [r1, r2] = await Promise.all([first, second]);
+    expect(r1).toBe(1);
+    expect(r2).toBe(2);
+    expect(secondStartedBeforeFirstFinished).toBe(false);
+    expect(order).toEqual(['first:start', 'first:end', 'second:start', 'second:end']);
+  });
+
+  it('runs concurrently for different agentIds', async () => {
+    const order: string[] = [];
+
+    const a = withStartAgentLock('agent-a', async () => {
+      order.push('a:start');
+      await new Promise(r => setTimeout(r, 20));
+      order.push('a:end');
+    });
+    const b = withStartAgentLock('agent-b', async () => {
+      order.push('b:start');
+      await new Promise(r => setTimeout(r, 20));
+      order.push('b:end');
+    });
+
+    await Promise.all([a, b]);
+
+    // Both should have started before either ended (no serialisation across ids).
+    expect(order.indexOf('b:start')).toBeLessThan(order.indexOf('a:end'));
+  });
+
+  it('releases the lock when the fn throws so subsequent callers can proceed', async () => {
+    await expect(
+      withStartAgentLock('agent-err', async () => {
+        throw new Error('boom');
+      })
+    ).rejects.toThrow('boom');
+
+    const result = await withStartAgentLock('agent-err', async () => 'ok');
+    expect(result).toBe('ok');
+  });
+});
diff --git a/services/gastown/container/src/process-manager.ts b/services/gastown/container/src/process-manager.ts
index 33a37fa29..02eb63a0f 100644
--- a/services/gastown/container/src/process-manager.ts
+++ b/services/gastown/container/src/process-manager.ts
@@ -71,6 +71,47 @@ export function isDraining(): boolean {
 // once created, the SDK instance is reused without locking.
 let sdkServerLock: Promise<void> = Promise.resolve();
 
+// Per-agentId mutex for startAgent. Without this, two concurrent POST
+// /agents/start calls for the same agentId (observed in production: two
+// `[control-server] /agents/start:` log lines at the same millisecond)
+// both pass the re-entrancy check at the top of startAgent before either
+// has committed a 'starting' record. The second invocation aborts the
+// first's startupAbortController and both paths race on session creation,
+// idle timers, and SDK instance reference counts — leaving the agent in
+// an inconsistent state (orphaned sessions, leaked sessionCount, etc).
+//
+// Serialising per agentId means the second caller waits for the first to
+// complete (or abort) before proceeding, and then observes a consistent
+// snapshot in `agents.get(agentId)`.
+const startAgentLocks = new Map<string, Promise<void>>();
+
+// Exported for tests that exercise the locking behaviour directly without
+// bringing up the whole SDK/process harness. Production callers should use
+// `startAgent` (which wraps `startAgentImpl` with this lock).
+export async function withStartAgentLock<T>(agentId: string, fn: () => Promise<T>): Promise<T> {
+  const previous = startAgentLocks.get(agentId) ?? Promise.resolve();
+  // Use the same explicit `new Promise` pattern as `sdkServerLock` above
+  // instead of `Promise.withResolvers`, which is not available on older
+  // Bun runtimes. This module is imported during container startup, so a
+  // missing global here would throw before the crash handlers are
+  // registered and prevent the control server from starting.
+  let releaseLock!: () => void;
+  const lockPromise = new Promise<void>(resolve => {
+    releaseLock = resolve;
+  });
+  startAgentLocks.set(agentId, lockPromise);
+  try {
+    await previous.catch(() => {});
+    return await fn();
+  } finally {
+    releaseLock();
+    // Only clear the slot if no newer caller has queued behind us.
+    if (startAgentLocks.get(agentId) === lockPromise) {
+      startAgentLocks.delete(agentId);
+    }
+  }
+}
+
 export function getUptime(): number {
   return Date.now() - startTime;
 }
@@ -1005,11 +1046,22 @@ async function subscribeToEvents(
 /**
  * Start an agent: ensure SDK server, create session, subscribe to events,
  * send initial prompt.
+ *
+ * Serialises concurrent callers for the same agentId so the re-entrancy
+ * handling inside `startAgentImpl` observes a consistent snapshot.
  */
 export async function startAgent(
   request: StartAgentRequest,
   workdir: string,
   env: Record<string, string>
+): Promise<void> {
+  return withStartAgentLock(request.agentId, () => startAgentImpl(request, workdir, env));
+}
+
+async function startAgentImpl(
+  request: StartAgentRequest,
+  workdir: string,
+  env: Record<string, string>
 ): Promise<void> {
   const existing = agents.get(request.agentId);
   if (existing && (existing.status === 'running' || existing.status === 'starting')) {
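
Reviewer note, not part of the patch: a minimal sketch of the per-key ordering guarantee
withStartAgentLock gives startAgent, written only against the exports added above. The file
name is made up, and it assumes process-manager.ts can be imported outside the container
harness (the test file does this with its dependencies mocked via vi.mock).

// lock-sketch.ts (illustration only)
import { withStartAgentLock } from './process-manager';

const order: string[] = [];

void (async () => {
  await Promise.all([
    // Same key: the second callback does not begin until the first settles,
    // which is why two simultaneous POST /agents/start calls no longer race.
    withStartAgentLock('agent-1', async () => {
      order.push('1:a-start');
      await new Promise(r => setTimeout(r, 10));
      order.push('1:a-end');
    }),
    withStartAgentLock('agent-1', async () => {
      order.push('1:b-start'); // always after '1:a-end'
    }),
    // Different key: free to interleave with the agent-1 work.
    withStartAgentLock('agent-2', async () => {
      order.push('2:start');
    }),
  ]);
  // agent-1 entries are strictly ordered; '2:start' may appear anywhere.
  console.log(order);
})();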