From baf2b66bce4a6912b4eb1ba49172c40a6c56cc33 Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Tue, 10 Mar 2026 14:18:12 -0500 Subject: [PATCH 1/4] fix(patrol): address review feedback on triage cooldown and crash-loop exclusion - Replace ineffective last_activity_at cooldown with a batch-bead-based cooldown: maybeDispatchTriageAgent now skips dispatch when a failed gt:triage batch bead exists within DISPATCH_COOLDOWN_MS. - Fix crash-loop exclusion to check the failed bead's labels instead of current_hook_bead_id (which is NULL after unhook), preventing unhooked triage agents from slipping through detection. --- cloudflare-gastown/src/dos/Town.do.ts | 33 +++++++++++------------ cloudflare-gastown/src/dos/town/patrol.ts | 14 +++++----- 2 files changed, 23 insertions(+), 24 deletions(-) diff --git a/cloudflare-gastown/src/dos/Town.do.ts b/cloudflare-gastown/src/dos/Town.do.ts index 4f34fde7c6..0db3741993 100644 --- a/cloudflare-gastown/src/dos/Town.do.ts +++ b/cloudflare-gastown/src/dos/Town.do.ts @@ -2464,8 +2464,13 @@ export class TownDO extends DurableObject { if (pendingCount === 0) return; // Check if a triage batch bead is already in progress (meaning a - // triage agent is working). We can't filter by role since triage - // uses polecat role; instead check for an open gt:triage batch bead. + // triage agent is working), or recently failed (cooldown to prevent + // rapid retry loops). Skip dispatch in either case. + const triageBatchLike = patrol.TRIAGE_LABEL_LIKE.replace( + patrol.TRIAGE_REQUEST_LABEL, + patrol.TRIAGE_BATCH_LABEL + ); + const cooldownCutoff = new Date(Date.now() - DISPATCH_COOLDOWN_MS).toISOString(); const existingBatch = [ ...query( this.sql, @@ -2473,16 +2478,19 @@ export class TownDO extends DurableObject { SELECT ${beads.bead_id} FROM ${beads} WHERE ${beads.type} = 'issue' AND ${beads.labels} LIKE ? 
- AND ${beads.status} IN ('open', 'in_progress') AND ${beads.created_by} = 'patrol' + AND ( + ${beads.status} IN ('open', 'in_progress') + OR (${beads.status} = 'failed' AND ${beads.updated_at} > ?) + ) LIMIT 1 `, - [patrol.TRIAGE_LABEL_LIKE.replace(patrol.TRIAGE_REQUEST_LABEL, patrol.TRIAGE_BATCH_LABEL)] + [triageBatchLike, cooldownCutoff] ), ]; if (existingBatch.length > 0) { console.log( - `${TOWN_LOG} maybeDispatchTriageAgent: triage agent already working, skipping (${pendingCount} pending)` + `${TOWN_LOG} maybeDispatchTriageAgent: triage batch bead active or in cooldown, skipping (${pendingCount} pending)` ); return; } @@ -2555,19 +2563,10 @@ export class TownDO extends DurableObject { agents.updateAgentStatus(this.sql, triageAgent.id, 'working'); } else { agents.unhookBead(this.sql, triageAgent.id); + // Failing the batch bead triggers cooldown: the guard at the top of + // this method skips dispatch while a failed batch bead's updated_at + // is within DISPATCH_COOLDOWN_MS. beadOps.updateBeadStatus(this.sql, triageBead.bead_id, 'failed', triageAgent.id); - // Apply dispatch cooldown so the next alarm tick doesn't immediately - // retry. Setting last_activity_at = now() makes the agent invisible - // to schedulePendingWork for DISPATCH_COOLDOWN_MS (2 min). - query( - this.sql, - /* sql */ ` - UPDATE ${agent_metadata} - SET ${agent_metadata.columns.last_activity_at} = ? - WHERE ${agent_metadata.bead_id} = ? 
- `, - [now(), triageAgent.id] - ); console.error(`${TOWN_LOG} maybeDispatchTriageAgent: triage agent failed to start`); } } diff --git a/cloudflare-gastown/src/dos/town/patrol.ts b/cloudflare-gastown/src/dos/town/patrol.ts index e8cc2de374..db3966fbec 100644 --- a/cloudflare-gastown/src/dos/town/patrol.ts +++ b/cloudflare-gastown/src/dos/town/patrol.ts @@ -585,8 +585,10 @@ export function detectCrashLoops(sql: SqlStorage): void { // Exclude triage agents from crash loop detection — their failures must // not create new triage requests, which would feed the feedback loop. - // An agent is considered a triage agent if its current hooked bead has - // the gt:triage or gt:triage-request label (both start with "gt:triage"). + // We check whether the *failed bead itself* carries a triage label + // (gt:triage or gt:triage-request). This is stable even after unhook + // clears current_hook_bead_id, because the bead_event.bead_id still + // points to the triage batch bead that was failed. const TRIAGE_LABEL_ANY = `%"gt:triage%`; const rows = CrashRow.array().parse([ @@ -600,11 +602,9 @@ export function detectCrashLoops(sql: SqlStorage): void { AND be.agent_id IS NOT NULL AND be.created_at > ? AND NOT EXISTS ( - SELECT 1 FROM ${agent_metadata} - INNER JOIN ${beads} AS hooked - ON ${agent_metadata.current_hook_bead_id} = hooked.${beads.columns.bead_id} - WHERE ${agent_metadata.bead_id} = be.agent_id - AND hooked.${beads.columns.labels} LIKE ? + SELECT 1 FROM ${beads} AS failed_bead + WHERE failed_bead.${beads.columns.bead_id} = be.bead_id + AND failed_bead.${beads.columns.labels} LIKE ? ) GROUP BY be.agent_id HAVING fail_count >= ? 
From 8b0ea6b72914d7c6b0f20dbb79b8c6ddfb6f815b Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Tue, 10 Mar 2026 19:27:30 -0500 Subject: [PATCH 2/4] fix(patrol): exempt escalation triage requests from global cap Escalations are agent/user-initiated and must not be silently dropped by the global cap, which exists to prevent feedback loops from patrol's automatic detections. Without this, low-severity escalations that don't notify the mayor would sit unprocessed with no triage follow-up. --- cloudflare-gastown/src/dos/town/patrol.ts | 43 +++++++++++++---------- 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/cloudflare-gastown/src/dos/town/patrol.ts b/cloudflare-gastown/src/dos/town/patrol.ts index db3966fbec..f880a7631c 100644 --- a/cloudflare-gastown/src/dos/town/patrol.ts +++ b/cloudflare-gastown/src/dos/town/patrol.ts @@ -108,25 +108,30 @@ export function createTriageRequest( } // Global cap: skip if there are already too many open triage requests. - // Prevents unbounded accumulation during feedback loops. - const openCountRows = [ - ...query( - sql, - /* sql */ ` - SELECT COUNT(*) AS cnt FROM ${beads} - WHERE ${beads.type} = 'issue' - AND ${beads.labels} LIKE ? - AND ${beads.status} = 'open' - `, - [TRIAGE_LABEL_LIKE] - ), - ]; - const openCount = Number(z.object({ cnt: z.number() }).parse(openCountRows[0]).cnt); - if (openCount >= MAX_OPEN_TRIAGE_REQUESTS) { - console.warn( - `${LOG} createTriageRequest: global cap reached (${openCount} open), skipping type=${params.triageType}` - ); - return; + // Prevents unbounded accumulation during feedback loops from patrol's + // automatic detections. Escalations are exempt — they are agent/user + // initiated and silently dropping them would leave the escalation bead + // open with no automated follow-up. + if (params.triageType !== 'escalation') { + const openCountRows = [ + ...query( + sql, + /* sql */ ` + SELECT COUNT(*) AS cnt FROM ${beads} + WHERE ${beads.type} = 'issue' + AND ${beads.labels} LIKE ? 
+ AND ${beads.status} = 'open' + `, + [TRIAGE_LABEL_LIKE] + ), + ]; + const openCount = Number(z.object({ cnt: z.number() }).parse(openCountRows[0]).cnt); + if (openCount >= MAX_OPEN_TRIAGE_REQUESTS) { + console.warn( + `${LOG} createTriageRequest: global cap reached (${openCount} open), skipping type=${params.triageType}` + ); + return; + } } const metadata: TriageRequestMetadata = { From 6eb2cf7ec3589821b5f2e2c0eaa7d96a60ce704a Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Wed, 11 Mar 2026 08:54:10 -0500 Subject: [PATCH 3/4] fix: remove triage from public AgentRole enum The triage role is only used as a container dispatch-time signal by maybeDispatchTriageAgent, not as a user-facing role. Including it in AgentRole exposed it via the public API (POST /agents, POST /get-or-create-agent), allowing external callers to create agents that skip repo clone and cannot work normal coding beads. --- cloudflare-gastown/src/types.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloudflare-gastown/src/types.ts b/cloudflare-gastown/src/types.ts index 5cd6c6d6fb..a775283462 100644 --- a/cloudflare-gastown/src/types.ts +++ b/cloudflare-gastown/src/types.ts @@ -48,7 +48,7 @@ export type BeadFilter = { // -- Agents (now beads + agent_metadata) -- -export const AgentRole = z.enum(['polecat', 'refinery', 'mayor', 'triage']); +export const AgentRole = z.enum(['polecat', 'refinery', 'mayor']); export type AgentRole = z.infer<typeof AgentRole>; export const AgentStatus = z.enum(['idle', 'working', 'stalled', 'dead']); From 8688e4281f737c9c02d15e332ec98a1a113c9b34 Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Wed, 11 Mar 2026 09:26:55 -0500 Subject: [PATCH 4/4] fix(patrol): exclude escalations from cap count and harden crash-loop filter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Exclude escalation-type triage requests from the global cap COUNT query so escalation backlog doesn't suppress patrol's automatic detections (crash_loop, 
stuck_agent, etc.). - Add second NOT EXISTS clause to detectCrashLoops that checks if the agent is currently hooked to a triage-labeled bead — covers the case where resolveTriage CLOSE_BEAD fails an ordinary bead with the triage agent as the actor. --- cloudflare-gastown/src/dos/town/patrol.ts | 29 +++++++++++++++-------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/cloudflare-gastown/src/dos/town/patrol.ts b/cloudflare-gastown/src/dos/town/patrol.ts index f880a7631c..b7e6298a4a 100644 --- a/cloudflare-gastown/src/dos/town/patrol.ts +++ b/cloudflare-gastown/src/dos/town/patrol.ts @@ -107,11 +107,10 @@ export function createTriageRequest( if (existing.length > 0) return; } - // Global cap: skip if there are already too many open triage requests. - // Prevents unbounded accumulation during feedback loops from patrol's - // automatic detections. Escalations are exempt — they are agent/user - // initiated and silently dropping them would leave the escalation bead - // open with no automated follow-up. + // Global cap: skip if there are already too many open *automatic* triage + // requests (patrol-generated). Escalations are exempt from both the gate + // and the count — they are agent/user initiated and silently dropping + // them would leave the escalation bead with no automated follow-up. if (params.triageType !== 'escalation') { const openCountRows = [ ...query( @@ -121,6 +120,7 @@ export function createTriageRequest( WHERE ${beads.type} = 'issue' AND ${beads.labels} LIKE ? AND ${beads.status} = 'open' + AND json_extract(${beads.metadata}, '$.triage_type') != 'escalation' `, [TRIAGE_LABEL_LIKE] ), @@ -590,10 +590,12 @@ export function detectCrashLoops(sql: SqlStorage): void { // Exclude triage agents from crash loop detection — their failures must // not create new triage requests, which would feed the feedback loop. - // We check whether the *failed bead itself* carries a triage label - // (gt:triage or gt:triage-request). 
This is stable even after unhook - // clears current_hook_bead_id, because the bead_event.bead_id still - // points to the triage batch bead that was failed. + // Two complementary checks: + // 1. The failed bead itself carries a triage label (covers triage batch + // bead failures, stable after unhook clears current_hook_bead_id). + // 2. The agent is currently hooked to a triage-labeled bead (covers + // resolveTriage actions like CLOSE_BEAD that fail ordinary beads + // while the triage agent is still working its batch). const TRIAGE_LABEL_ANY = `%"gt:triage%`; const rows = CrashRow.array().parse([ @@ -611,10 +613,17 @@ export function detectCrashLoops(sql: SqlStorage): void { WHERE failed_bead.${beads.columns.bead_id} = be.bead_id AND failed_bead.${beads.columns.labels} LIKE ? ) + AND NOT EXISTS ( + SELECT 1 FROM ${agent_metadata} + INNER JOIN ${beads} AS hooked + ON ${agent_metadata.current_hook_bead_id} = hooked.${beads.columns.bead_id} + WHERE ${agent_metadata.bead_id} = be.agent_id + AND hooked.${beads.columns.labels} LIKE ? + ) GROUP BY be.agent_id HAVING fail_count >= ? `, - [windowCutoff, TRIAGE_LABEL_ANY, CRASH_LOOP_THRESHOLD] + [windowCutoff, TRIAGE_LABEL_ANY, TRIAGE_LABEL_ANY, CRASH_LOOP_THRESHOLD] ), ]);