From 2e76951e82d6c1408a6fdb191fda275409b2c027 Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Mon, 23 Mar 2026 23:24:34 -0500 Subject: [PATCH 1/5] =?UTF-8?q?fix(gastown):=20town=20containers=20never?= =?UTF-8?q?=20idle=20=E2=80=94=20mayor=20holds=20alarm=20at=205s,=20consta?= =?UTF-8?q?nt=20health=20checks=20reset=20sleep=20timer?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add 'waiting' agent status for mayors that finished processing a prompt and are waiting for user input. This lets hasActiveWork() return false, dropping the alarm from 5s to 5m and stopping health-check pings that reset the container's sleepAfter timer. Changes: - Add 'waiting' to AgentStatus enum (types.ts, agent-metadata.table.ts) - Container reports mayor idle via POST /agents/:id/waiting (completion-reporter.ts) - TownDO.mayorWaiting() transitions working → waiting - sendMayorMessage transitions waiting → working on new user input - Persist token refresh throttle in ctx.storage (survives DO eviction) - Increase idle alarm interval from 60s to 300s Closes #1450 --- .../container/src/completion-reporter.ts | 39 +++++ .../container/src/process-manager.ts | 7 +- .../src/db/tables/agent-metadata.table.ts | 2 +- cloudflare-gastown/src/dos/Town.do.ts | 62 +++++-- cloudflare-gastown/src/gastown.worker.ts | 6 + .../src/handlers/rig-agents.handler.ts | 16 ++ cloudflare-gastown/src/types.ts | 2 +- .../test/integration/mayor-idle.test.ts | 153 ++++++++++++++++++ 8 files changed, 274 insertions(+), 13 deletions(-) create mode 100644 cloudflare-gastown/test/integration/mayor-idle.test.ts diff --git a/cloudflare-gastown/container/src/completion-reporter.ts b/cloudflare-gastown/container/src/completion-reporter.ts index 8ad4be23bd..cfbd4f12bd 100644 --- a/cloudflare-gastown/container/src/completion-reporter.ts +++ b/cloudflare-gastown/container/src/completion-reporter.ts @@ -7,6 +7,45 @@ import type { ManagedAgent } from './types'; +/** + * Notify the TownDO that the mayor has finished processing a prompt and + * is now waiting for user input. This lets the TownDO transition the + * mayor from "working" to "waiting", which drops the alarm to the idle + * cadence and stops health-check pings that reset the container's + * sleepAfter timer. + * + * Best-effort: errors are logged but do not propagate. + */ +export async function reportMayorWaiting(agent: ManagedAgent): Promise { + const apiUrl = agent.gastownApiUrl; + const authToken = + process.env.GASTOWN_CONTAINER_TOKEN ?? agent.gastownContainerToken ?? agent.gastownSessionToken; + if (!apiUrl || !authToken) { + console.warn(`Cannot report mayor ${agent.agentId} waiting: no API credentials on agent record`); + return; + } + + const url = `${apiUrl}/api/towns/${agent.townId}/rigs/${agent.rigId}/agents/${agent.agentId}/waiting`; + try { + const response = await fetch(url, { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + Authorization: `Bearer ${authToken}`, + }, + body: JSON.stringify({ agentId: agent.agentId }), + }); + + if (!response.ok) { + console.warn( + `Failed to report mayor ${agent.agentId} waiting: ${response.status} ${response.statusText}` + ); + } + } catch (err) { + console.warn(`Error reporting mayor ${agent.agentId} waiting:`, err); + } +} + /** * Notify the Rig DO that an agent session has completed or failed. * Best-effort: errors are logged but do not propagate. diff --git a/cloudflare-gastown/container/src/process-manager.ts b/cloudflare-gastown/container/src/process-manager.ts index f0008e320f..ad9bae0a44 100644 --- a/cloudflare-gastown/container/src/process-manager.ts +++ b/cloudflare-gastown/container/src/process-manager.ts @@ -9,7 +9,7 @@ import { createKilo, type KiloClient } from '@kilocode/sdk'; import { z } from 'zod'; import type { ManagedAgent, StartAgentRequest } from './types'; -import { reportAgentCompleted } from './completion-reporter'; +import { reportAgentCompleted, reportMayorWaiting } from './completion-reporter'; import { buildKiloConfigContent } from './agent-runner'; import { log } from './logger'; @@ -485,6 +485,11 @@ async function subscribeToEvents( if (event.type === 'session.idle') { if (request.role === 'mayor') { // Mayor agents are persistent — session.idle means "turn done", not exit. + // Notify the TownDO so it can transition the mayor to "waiting" + // (alive in container, not doing LLM work). This lets the alarm + // drop to the idle cadence and stops health-check pings that + // would reset the container's sleepAfter timer. + void reportMayorWaiting(agent); continue; } // Non-mayor: check for pending nudges before deciding to exit. diff --git a/cloudflare-gastown/src/db/tables/agent-metadata.table.ts b/cloudflare-gastown/src/db/tables/agent-metadata.table.ts index 6409e4c1a6..f9b9228dc8 100644 --- a/cloudflare-gastown/src/db/tables/agent-metadata.table.ts +++ b/cloudflare-gastown/src/db/tables/agent-metadata.table.ts @@ -5,7 +5,7 @@ import { getTableFromZodSchema, getCreateTableQueryFromTable } from '../../util/ // queries parsing through AgentMetadataRecord don't throw on old rows. // Application code should only create the known roles below. const AgentRole = z.enum(['polecat', 'refinery', 'mayor']).or(z.string()); -const AgentProcessStatus = z.enum(['idle', 'working', 'stalled', 'dead']).or(z.string()); +const AgentProcessStatus = z.enum(['idle', 'working', 'waiting', 'stalled', 'dead']).or(z.string()); export const AgentMetadataRecord = z.object({ bead_id: z.string(), diff --git a/cloudflare-gastown/src/dos/Town.do.ts b/cloudflare-gastown/src/dos/Town.do.ts index b3eeaa158c..c7db07e31b 100644 --- a/cloudflare-gastown/src/dos/Town.do.ts +++ b/cloudflare-gastown/src/dos/Town.do.ts @@ -120,7 +120,7 @@ function formatEventMessage(row: Record): string { // Alarm intervals const ACTIVE_ALARM_INTERVAL_MS = 5_000; // 5s when agents are active -const IDLE_ALARM_INTERVAL_MS = 1 * 60_000; // 1m when idle +const IDLE_ALARM_INTERVAL_MS = 5 * 60_000; // 5m when idle (no working agents) // Escalation constants const STALE_ESCALATION_THRESHOLD_MS = 4 * 60 * 60 * 1000; @@ -592,7 +592,7 @@ export class TownDO extends DurableObject { const townConfig = await this.getTownConfig(); const userId = townConfig.owner_user_id ?? townId; await dispatch.forceRefreshContainerToken(this.env, townId, userId); - this.lastContainerTokenRefreshAt = Date.now(); + await this.ctx.storage.put('container:lastTokenRefreshAt', Date.now()); } /** @@ -1407,6 +1407,32 @@ export class TownDO extends DurableObject { await this.armAlarmIfNeeded(); } + /** + * Transition the mayor from "working" to "waiting". Called by the + * container when the mayor's session goes idle (turn done, waiting for + * user input). The "waiting" status means the mayor is alive in the + * container but not doing LLM work — hasActiveWork() returns false, + * so the alarm drops to the idle cadence and health-check pings stop + * resetting the container's sleepAfter timer. + */ + async mayorWaiting(agentId?: string): Promise { + let resolvedAgentId = agentId; + if (!resolvedAgentId) { + const mayor = agents.listAgents(this.sql, { role: 'mayor' })[0]; + if (mayor) resolvedAgentId = mayor.id; + } + if (!resolvedAgentId) return; + + const agent = agents.getAgent(this.sql, resolvedAgentId); + if (!agent || agent.role !== 'mayor') return; + + // Only transition from working → waiting. If the agent has already + // been set to idle/stalled/dead by another path, don't overwrite. + if (agent.status === 'working') { + agents.updateAgentStatus(this.sql, resolvedAgentId, 'waiting'); + } + } + async agentCompleted( agentId: string, input: { status: 'completed' | 'failed'; reason?: string } @@ -1892,7 +1918,16 @@ export class TownDO extends DurableObject { if (isAlive) { const sent = await dispatch.sendMessageToAgent(this.env, townId, mayor.id, combinedMessage); - sessionStatus = sent ? 'active' : 'idle'; + if (sent) { + // Transition waiting → working so the alarm runs at the active cadence + // while the mayor processes this prompt. + if (mayor.status === 'waiting') { + agents.updateAgentStatus(this.sql, mayor.id, 'working'); + } + sessionStatus = 'active'; + } else { + sessionStatus = 'idle'; + } } else { const townConfig = await this.getTownConfig(); const rigConfig = await this.getMayorRigConfig(); @@ -1986,8 +2021,8 @@ export class TownDO extends DurableObject { const isAlive = containerStatus.status === 'running' || containerStatus.status === 'starting'; if (isAlive) { - const status = mayor.status === 'working' || mayor.status === 'stalled' ? 'active' : 'idle'; - return { agentId: mayor.id, sessionStatus: status }; + const isActive = mayor.status === 'working' || mayor.status === 'stalled' || mayor.status === 'waiting'; + return { agentId: mayor.id, sessionStatus: isActive ? 'active' : 'idle' }; } // Start the container with an idle mayor (no initial prompt) @@ -2103,7 +2138,7 @@ export class TownDO extends DurableObject { const mapStatus = (agentStatus: string): 'idle' | 'active' | 'starting' => { switch (agentStatus) { case 'working': - return 'active'; + case 'waiting': case 'stalled': return 'active'; default: @@ -3282,12 +3317,18 @@ export class TownDO extends DurableObject { * from the alarm handler, throttled to once per hour (tokens have * 8h expiry). The TownContainerDO stores it as an env var so it's * available to all agents in the container. + * + * The throttle timestamp is persisted in ctx.storage so it survives + * DO eviction. Without persistence, eviction resets the throttle to 0 + * and the refresh fires immediately on the next alarm tick, sending + * requests that reset the container's sleepAfter timer (#1409). */ - private lastContainerTokenRefreshAt = 0; private async refreshContainerToken(): Promise { const TOKEN_REFRESH_INTERVAL_MS = 60 * 60_000; // 1 hour const now = Date.now(); - if (now - this.lastContainerTokenRefreshAt < TOKEN_REFRESH_INTERVAL_MS) return; + const lastRefresh = + (await this.ctx.storage.get('container:lastTokenRefreshAt')) ?? 0; + if (now - lastRefresh < TOKEN_REFRESH_INTERVAL_MS) return; const townId = this.townId; if (!townId) return; @@ -3296,7 +3337,7 @@ export class TownDO extends DurableObject { await dispatch.refreshContainerToken(this.env, townId, userId); // Only mark as refreshed after success — failed refreshes should // be retried on the next alarm tick, not throttled for an hour. - this.lastContainerTokenRefreshAt = now; + await this.ctx.storage.put('container:lastTokenRefreshAt', now); } private hasActiveWork(): boolean { @@ -3747,6 +3788,7 @@ export class TownDO extends DurableObject { }; agents: { working: number; + waiting: number; idle: number; stalled: number; dead: number; @@ -3788,7 +3830,7 @@ export class TownDO extends DurableObject { [] ), ]; - const agentCounts = { working: 0, idle: 0, stalled: 0, dead: 0, total: 0 }; + const agentCounts = { working: 0, waiting: 0, idle: 0, stalled: 0, dead: 0, total: 0 }; for (const row of agentRows) { const s = `${row.status as string}`; const c = Number(row.cnt); diff --git a/cloudflare-gastown/src/gastown.worker.ts b/cloudflare-gastown/src/gastown.worker.ts index 32d676e0af..901b9f924d 100644 --- a/cloudflare-gastown/src/gastown.worker.ts +++ b/cloudflare-gastown/src/gastown.worker.ts @@ -36,6 +36,7 @@ import { handleAgentDone, handleRequestChanges, handleAgentCompleted, + handleAgentWaiting, handleWriteCheckpoint, handleCheckMail, handleHeartbeat, @@ -377,6 +378,11 @@ app.post('/api/towns/:townId/rigs/:rigId/agents/:agentId/completed', c => handleAgentCompleted(c, c.req.param()) ) ); +app.post('/api/towns/:townId/rigs/:rigId/agents/:agentId/waiting', c => + instrumented(c, 'POST /api/towns/:townId/rigs/:rigId/agents/:agentId/waiting', () => + handleAgentWaiting(c, c.req.param()) + ) +); app.post('/api/towns/:townId/rigs/:rigId/agents/:agentId/checkpoint', c => instrumented(c, 'POST /api/towns/:townId/rigs/:rigId/agents/:agentId/checkpoint', () => handleWriteCheckpoint(c, c.req.param()) diff --git a/cloudflare-gastown/src/handlers/rig-agents.handler.ts b/cloudflare-gastown/src/handlers/rig-agents.handler.ts index 24c429a4c9..768c06229f 100644 --- a/cloudflare-gastown/src/handlers/rig-agents.handler.ts +++ b/cloudflare-gastown/src/handlers/rig-agents.handler.ts @@ -162,6 +162,22 @@ export async function handleAgentCompleted( return c.json(resSuccess({ completed: true })); } +/** + * Called by the container when the mayor's session goes idle (turn done, + * waiting for user input). Transitions the mayor from "working" to + * "waiting" so the alarm drops to the idle cadence and health-check + * pings stop resetting the container's sleepAfter timer. + */ +export async function handleAgentWaiting( + c: Context, + params: { rigId: string; agentId: string } +) { + const townId = c.get('townId'); + const town = getTownDOStub(c.env, townId); + await town.mayorWaiting(params.agentId); + return c.json(resSuccess({ acknowledged: true })); +} + export async function handleWriteCheckpoint( c: Context, params: { rigId: string; agentId: string } diff --git a/cloudflare-gastown/src/types.ts b/cloudflare-gastown/src/types.ts index f59b3d2169..2450970358 100644 --- a/cloudflare-gastown/src/types.ts +++ b/cloudflare-gastown/src/types.ts @@ -51,7 +51,7 @@ export type BeadFilter = { export const AgentRole = z.enum(['polecat', 'refinery', 'mayor']); export type AgentRole = z.infer; -export const AgentStatus = z.enum(['idle', 'working', 'stalled', 'dead']); +export const AgentStatus = z.enum(['idle', 'working', 'waiting', 'stalled', 'dead']); export type AgentStatus = z.infer; /** diff --git a/cloudflare-gastown/test/integration/mayor-idle.test.ts b/cloudflare-gastown/test/integration/mayor-idle.test.ts new file mode 100644 index 0000000000..c12527a4ae --- /dev/null +++ b/cloudflare-gastown/test/integration/mayor-idle.test.ts @@ -0,0 +1,153 @@ +/** + * Integration tests for the mayor idle (waiting) lifecycle. + * + * Verifies that: + * 1. The "waiting" agent status exists and can be set + * 2. hasActiveWork() returns false when the only agent is a waiting mayor + * 3. The alarm interval drops to idle cadence when the mayor is waiting + * 4. mayorWaiting() transitions a working mayor to waiting + * 5. sendMayorMessage transitions a waiting mayor back to working (when container is alive) + * 6. Token refresh throttle persists across DO eviction (ctx.storage) + */ + +import { env, runDurableObjectAlarm } from 'cloudflare:test'; +import { describe, it, expect, beforeEach } from 'vitest'; + +function getTownStub(name = 'test-town') { + const id = env.TOWN.idFromName(name); + return env.TOWN.get(id); +} + +describe('Mayor idle lifecycle', () => { + let town: ReturnType; + let townName: string; + + beforeEach(async () => { + townName = `mayor-idle-${crypto.randomUUID()}`; + town = getTownStub(townName); + await town.setTownId(townName); + await town.addRig({ + rigId: 'rig-1', + name: 'main-rig', + gitUrl: 'https://github.com/test/repo.git', + defaultBranch: 'main', + }); + }); + + // ── waiting status ────────────────────────────────────────────────── + + describe('waiting status', () => { + it('should allow setting an agent to waiting', async () => { + // Register a mayor agent directly + const agentsBefore = await town.listAgents({ role: 'mayor' }); + expect(agentsBefore.length).toBe(0); + + // ensureMayor creates the agent (won't start container in test env) + const result = await town.ensureMayor(); + expect(result.agentId).toBeTruthy(); + + // Set the agent to working first, then waiting + await town.updateAgentStatus(result.agentId, 'working'); + const workingAgent = await town.getAgentAsync(result.agentId); + expect(workingAgent?.status).toBe('working'); + + // mayorWaiting should transition working → waiting + await town.mayorWaiting(result.agentId); + const waitingAgent = await town.getAgentAsync(result.agentId); + expect(waitingAgent?.status).toBe('waiting'); + }); + + it('should not transition non-working agents to waiting', async () => { + const result = await town.ensureMayor(); + + // Agent starts as idle (container not running in test env) + const agent = await town.getAgentAsync(result.agentId); + expect(agent?.status).toBe('idle'); + + // mayorWaiting should NOT change idle to waiting + await town.mayorWaiting(result.agentId); + const afterAgent = await town.getAgentAsync(result.agentId); + expect(afterAgent?.status).toBe('idle'); + }); + + it('should resolve empty agentId to the mayor', async () => { + const result = await town.ensureMayor(); + await town.updateAgentStatus(result.agentId, 'working'); + + // Call with undefined agentId — should resolve to mayor + await town.mayorWaiting(); + const agent = await town.getAgentAsync(result.agentId); + expect(agent?.status).toBe('waiting'); + }); + }); + + // ── hasActiveWork / alarm interval ────────────────────────────────── + + describe('alarm interval with waiting mayor', () => { + it('should use idle alarm interval when mayor is waiting', async () => { + const result = await town.ensureMayor(); + + // Set mayor to working → alarm should be active (5s) + await town.updateAgentStatus(result.agentId, 'working'); + const activeStatus = await town.getAlarmStatus(); + expect(activeStatus.alarm.intervalMs).toBe(5_000); + + // Set mayor to waiting → alarm should drop to idle (5 min) + await town.updateAgentStatus(result.agentId, 'waiting'); + const idleStatus = await town.getAlarmStatus(); + expect(idleStatus.alarm.intervalMs).toBe(5 * 60_000); + }); + + it('should use active alarm interval when a polecat is working alongside a waiting mayor', async () => { + const result = await town.ensureMayor(); + await town.updateAgentStatus(result.agentId, 'waiting'); + + // Create a convoy to get a working polecat + const convoy = await town.slingConvoy({ + rigId: 'rig-1', + convoyTitle: 'Test', + tasks: [{ title: 'Task 1' }], + }); + + // Run alarm to assign and dispatch the polecat + await runDurableObjectAlarm(town); + + const bead = await town.getBeadAsync(convoy.beads[0].bead.bead_id); + expect(bead?.assignee_agent_bead_id).toBeTruthy(); + + // Set the polecat to working + if (bead?.assignee_agent_bead_id) { + await town.updateAgentStatus(bead.assignee_agent_bead_id, 'working'); + } + + // Now alarm should be active (polecat is working) + const status = await town.getAlarmStatus(); + expect(status.alarm.intervalMs).toBe(5_000); + }); + }); + + // ── getMayorStatus mapping ───────────────────────────────────────── + + describe('getMayorStatus', () => { + it('should report waiting mayor as active', async () => { + const result = await town.ensureMayor(); + await town.updateAgentStatus(result.agentId, 'waiting'); + + const status = await town.getMayorStatus(); + expect(status.session?.status).toBe('active'); + }); + }); + + // ── getAlarmStatus agent counts ──────────────────────────────────── + + describe('getAlarmStatus agent counts', () => { + it('should include waiting in agent counts', async () => { + const result = await town.ensureMayor(); + await town.updateAgentStatus(result.agentId, 'waiting'); + + const status = await town.getAlarmStatus(); + expect(status.agents.waiting).toBe(1); + expect(status.agents.working).toBe(0); + }); + }); +}); From 99e9ab023d568acb6479c6e563b4adc15ed5617f Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Tue, 24 Mar 2026 08:27:35 -0500 Subject: [PATCH 2/5] =?UTF-8?q?fix:=20reschedule=20alarm=20to=20active=20c?= =?UTF-8?q?adence=20when=20mayor=20transitions=20waiting=20=E2=86=92=20wor?= =?UTF-8?q?king?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When sendMayorMessage transitions the mayor from waiting to working, the idle alarm may be up to 5 min away. Explicitly reschedule to ACTIVE_ALARM_INTERVAL_MS (5s) so the reconciler/health-check loop resumes promptly. --- cloudflare-gastown/src/dos/Town.do.ts | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cloudflare-gastown/src/dos/Town.do.ts b/cloudflare-gastown/src/dos/Town.do.ts index c7db07e31b..bb4ace8042 100644 --- a/cloudflare-gastown/src/dos/Town.do.ts +++ b/cloudflare-gastown/src/dos/Town.do.ts @@ -1920,9 +1920,12 @@ export class TownDO extends DurableObject { const sent = await dispatch.sendMessageToAgent(this.env, townId, mayor.id, combinedMessage); if (sent) { // Transition waiting → working so the alarm runs at the active cadence - // while the mayor processes this prompt. + // while the mayor processes this prompt. Also reschedule the alarm + // immediately — the idle alarm may be up to 5 min away, and we need + // the reconciler/health-check loop to resume promptly. if (mayor.status === 'waiting') { agents.updateAgentStatus(this.sql, mayor.id, 'working'); + await this.ctx.storage.setAlarm(Date.now() + ACTIVE_ALARM_INTERVAL_MS); } sessionStatus = 'active'; } else { From 9eb602a39c266b3085067a03e38f13d044e497b0 Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Tue, 24 Mar 2026 11:26:15 -0500 Subject: [PATCH 3/5] style: fix formatting in Town.do.ts and completion-reporter.ts --- cloudflare-gastown/container/src/completion-reporter.ts | 4 +++- cloudflare-gastown/src/dos/Town.do.ts | 6 +++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/cloudflare-gastown/container/src/completion-reporter.ts b/cloudflare-gastown/container/src/completion-reporter.ts index cfbd4f12bd..492a41d57b 100644 --- a/cloudflare-gastown/container/src/completion-reporter.ts +++ b/cloudflare-gastown/container/src/completion-reporter.ts @@ -21,7 +21,9 @@ export async function reportMayorWaiting(agent: ManagedAgent): Promise { const authToken = process.env.GASTOWN_CONTAINER_TOKEN ?? agent.gastownContainerToken ?? agent.gastownSessionToken; if (!apiUrl || !authToken) { - console.warn(`Cannot report mayor ${agent.agentId} waiting: no API credentials on agent record`); + console.warn( + `Cannot report mayor ${agent.agentId} waiting: no API credentials on agent record` + ); return; } diff --git a/cloudflare-gastown/src/dos/Town.do.ts b/cloudflare-gastown/src/dos/Town.do.ts index bb4ace8042..c1446fb6fc 100644 --- a/cloudflare-gastown/src/dos/Town.do.ts +++ b/cloudflare-gastown/src/dos/Town.do.ts @@ -2024,7 +2024,8 @@ export class TownDO extends DurableObject { const isAlive = containerStatus.status === 'running' || containerStatus.status === 'starting'; if (isAlive) { - const isActive = mayor.status === 'working' || mayor.status === 'stalled' || mayor.status === 'waiting'; + const isActive = + mayor.status === 'working' || mayor.status === 'stalled' || mayor.status === 'waiting'; return { agentId: mayor.id, sessionStatus: isActive ? 'active' : 'idle' }; } @@ -3329,8 +3330,7 @@ export class TownDO extends DurableObject { private async refreshContainerToken(): Promise { const TOKEN_REFRESH_INTERVAL_MS = 60 * 60_000; // 1 hour const now = Date.now(); - const lastRefresh = - (await this.ctx.storage.get('container:lastTokenRefreshAt')) ?? 0; + const lastRefresh = (await this.ctx.storage.get('container:lastTokenRefreshAt')) ?? 0; if (now - lastRefresh < TOKEN_REFRESH_INTERVAL_MS) return; const townId = this.townId; From b754fdd2aa009ba627e02501a42679379aad534b Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Tue, 24 Mar 2026 11:33:35 -0500 Subject: [PATCH 4/5] fix: update idle intervalLabel and guard against stale session.idle callbacks - Fix intervalLabel in getAlarmStatus() to show 'idle (5m)' instead of 'idle (60s)' to match the new IDLE_ALARM_INTERVAL_MS. - Add _mayorWorkingSince timestamp guard in mayorWaiting() to reject stale session.idle callbacks that arrive after sendMayorMessage has already re-activated the mayor. The 5s grace window covers the round-trip time for fire-and-forget reportMayorWaiting() calls. --- cloudflare-gastown/src/dos/Town.do.ts | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/cloudflare-gastown/src/dos/Town.do.ts b/cloudflare-gastown/src/dos/Town.do.ts index c1446fb6fc..824f3a3e90 100644 --- a/cloudflare-gastown/src/dos/Town.do.ts +++ b/cloudflare-gastown/src/dos/Town.do.ts @@ -537,6 +537,10 @@ export class TownDO extends DurableObject { private _townId: string | null = null; private _lastReconcilerMetrics: reconciler.ReconcilerMetrics | null = null; private _dashboardContext: string | null = null; + /** Monotonic timestamp of the last working → transition for the mayor. + * Used to reject stale session.idle callbacks that arrive after a new + * prompt has already re-activated the mayor. */ + private _mayorWorkingSince = 0; private get townId(): string { return this._townId ?? this.ctx.id.name ?? this.ctx.id.toString(); @@ -1428,7 +1432,13 @@ export class TownDO extends DurableObject { // Only transition from working → waiting. If the agent has already // been set to idle/stalled/dead by another path, don't overwrite. + // Guard against stale session.idle callbacks: reportMayorWaiting is + // fire-and-forget, so a callback from the previous turn can arrive + // after sendMayorMessage has already re-activated the mayor. Reject + // transitions that arrive within 5s of the mayor being set to working. if (agent.status === 'working') { + const STALE_GUARD_MS = 5_000; + if (Date.now() - this._mayorWorkingSince < STALE_GUARD_MS) return; agents.updateAgentStatus(this.sql, resolvedAgentId, 'waiting'); } } @@ -1925,6 +1935,7 @@ export class TownDO extends DurableObject { // the reconciler/health-check loop to resume promptly. if (mayor.status === 'waiting') { agents.updateAgentStatus(this.sql, mayor.id, 'working'); + this._mayorWorkingSince = Date.now(); await this.ctx.storage.setAlarm(Date.now() + ACTIVE_ALARM_INTERVAL_MS); } sessionStatus = 'active'; @@ -1976,6 +1987,7 @@ export class TownDO extends DurableObject { if (started) { agents.updateAgentStatus(this.sql, mayor.id, 'working'); + this._mayorWorkingSince = Date.now(); sessionStatus = 'starting'; } else { sessionStatus = 'idle'; @@ -2078,6 +2090,7 @@ export class TownDO extends DurableObject { if (started) { agents.updateAgentStatus(this.sql, mayor.id, 'working'); + this._mayorWorkingSince = Date.now(); return { agentId: mayor.id, sessionStatus: 'starting' }; } @@ -3954,7 +3967,7 @@ export class TownDO extends DurableObject { alarm: { nextFireAt: currentAlarm ? new Date(Number(currentAlarm)).toISOString() : null, intervalMs, - intervalLabel: active ? 'active (5s)' : 'idle (60s)', + intervalLabel: active ? 'active (5s)' : 'idle (5m)', }, agents: agentCounts, beads: beadCounts, From 6db3a1ee49606ab026bf2e4022fdb6d05c716c98 Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Tue, 31 Mar 2026 10:39:35 -0500 Subject: [PATCH 5/5] fix: replace time-based stale guard with firedAt timestamp comparison The 5s time-based guard in mayorWaiting() rejected legitimate fast turns where the mayor finishes a prompt in under 5s. Replace with a firedAt timestamp that the container includes in the reportMayorWaiting callback. mayorWaiting() now only rejects callbacks whose firedAt predates the last working transition (_mayorWorkingSince), correctly distinguishing stale callbacks from current ones regardless of turn duration. --- .../container/src/completion-reporter.ts | 2 +- cloudflare-gastown/src/dos/Town.do.ts | 17 +++++++++++------ .../src/handlers/rig-agents.handler.ts | 4 +++- 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/cloudflare-gastown/container/src/completion-reporter.ts b/cloudflare-gastown/container/src/completion-reporter.ts index 492a41d57b..7a7766e61f 100644 --- a/cloudflare-gastown/container/src/completion-reporter.ts +++ b/cloudflare-gastown/container/src/completion-reporter.ts @@ -35,7 +35,7 @@ export async function reportMayorWaiting(agent: ManagedAgent): Promise { 'Content-Type': 'application/json', Authorization: `Bearer ${authToken}`, }, - body: JSON.stringify({ agentId: agent.agentId }), + body: JSON.stringify({ agentId: agent.agentId, firedAt: Date.now() }), }); if (!response.ok) { diff --git a/cloudflare-gastown/src/dos/Town.do.ts b/cloudflare-gastown/src/dos/Town.do.ts index 824f3a3e90..4c65d75ddc 100644 --- a/cloudflare-gastown/src/dos/Town.do.ts +++ b/cloudflare-gastown/src/dos/Town.do.ts @@ -1418,8 +1418,13 @@ export class TownDO extends DurableObject { * container but not doing LLM work — hasActiveWork() returns false, * so the alarm drops to the idle cadence and health-check pings stop * resetting the container's sleepAfter timer. + * + * @param firedAt - Timestamp (ms) when the container fired this + * callback. Used to reject stale session.idle callbacks from a + * previous turn that arrive after the mayor has already been + * re-activated by a new prompt. */ - async mayorWaiting(agentId?: string): Promise { + async mayorWaiting(agentId?: string, firedAt?: number): Promise { let resolvedAgentId = agentId; if (!resolvedAgentId) { const mayor = agents.listAgents(this.sql, { role: 'mayor' })[0]; @@ -1433,12 +1438,12 @@ export class TownDO extends DurableObject { // Only transition from working → waiting. If the agent has already // been set to idle/stalled/dead by another path, don't overwrite. // Guard against stale session.idle callbacks: reportMayorWaiting is - // fire-and-forget, so a callback from the previous turn can arrive - // after sendMayorMessage has already re-activated the mayor. Reject - // transitions that arrive within 5s of the mayor being set to working. + // fire-and-forget, so a callback from a previous turn can arrive + // after sendMayorMessage has already re-activated the mayor. If the + // callback carries a firedAt timestamp that predates the last + // working transition, it belongs to an older turn — reject it. if (agent.status === 'working') { - const STALE_GUARD_MS = 5_000; - if (Date.now() - this._mayorWorkingSince < STALE_GUARD_MS) return; + if (firedAt && firedAt < this._mayorWorkingSince) return; agents.updateAgentStatus(this.sql, resolvedAgentId, 'waiting'); } } diff --git a/cloudflare-gastown/src/handlers/rig-agents.handler.ts b/cloudflare-gastown/src/handlers/rig-agents.handler.ts index 768c06229f..a0651583d7 100644 --- a/cloudflare-gastown/src/handlers/rig-agents.handler.ts +++ b/cloudflare-gastown/src/handlers/rig-agents.handler.ts @@ -172,9 +172,11 @@ export async function handleAgentWaiting( c: Context, params: { rigId: string; agentId: string } ) { + const body = (await parseJsonBody(c)) as Record; + const firedAt = typeof body?.firedAt === 'number' ? body.firedAt : undefined; const townId = c.get('townId'); const town = getTownDOStub(c.env, townId); - await town.mayorWaiting(params.agentId); + await town.mayorWaiting(params.agentId, firedAt); return c.json(resSuccess({ acknowledged: true })); }