diff --git a/cloudflare-gastown/src/dos/Town.do.ts b/cloudflare-gastown/src/dos/Town.do.ts index 4f34fde7c6..0db3741993 100644 --- a/cloudflare-gastown/src/dos/Town.do.ts +++ b/cloudflare-gastown/src/dos/Town.do.ts @@ -2464,8 +2464,13 @@ export class TownDO extends DurableObject { if (pendingCount === 0) return; // Check if a triage batch bead is already in progress (meaning a - // triage agent is working). We can't filter by role since triage - // uses polecat role; instead check for an open gt:triage batch bead. + // triage agent is working), or recently failed (cooldown to prevent + // rapid retry loops). Skip dispatch in either case. + const triageBatchLike = patrol.TRIAGE_LABEL_LIKE.replace( + patrol.TRIAGE_REQUEST_LABEL, + patrol.TRIAGE_BATCH_LABEL + ); + const cooldownCutoff = new Date(Date.now() - DISPATCH_COOLDOWN_MS).toISOString(); const existingBatch = [ ...query( this.sql, @@ -2473,16 +2478,19 @@ export class TownDO extends DurableObject { SELECT ${beads.bead_id} FROM ${beads} WHERE ${beads.type} = 'issue' AND ${beads.labels} LIKE ? - AND ${beads.status} IN ('open', 'in_progress') AND ${beads.created_by} = 'patrol' + AND ( + ${beads.status} IN ('open', 'in_progress') + OR (${beads.status} = 'failed' AND ${beads.updated_at} > ?) + ) LIMIT 1 `, - [patrol.TRIAGE_LABEL_LIKE.replace(patrol.TRIAGE_REQUEST_LABEL, patrol.TRIAGE_BATCH_LABEL)] + [triageBatchLike, cooldownCutoff] ), ]; if (existingBatch.length > 0) { console.log( - `${TOWN_LOG} maybeDispatchTriageAgent: triage agent already working, skipping (${pendingCount} pending)` + `${TOWN_LOG} maybeDispatchTriageAgent: triage batch bead active or in cooldown, skipping (${pendingCount} pending)` ); return; } @@ -2555,19 +2563,10 @@ export class TownDO extends DurableObject { agents.updateAgentStatus(this.sql, triageAgent.id, 'working'); } else { agents.unhookBead(this.sql, triageAgent.id); + // Failing the batch bead triggers cooldown: the guard at the top of + // this method skips dispatch while a failed batch bead's updated_at + // is within DISPATCH_COOLDOWN_MS. beadOps.updateBeadStatus(this.sql, triageBead.bead_id, 'failed', triageAgent.id); - // Apply dispatch cooldown so the next alarm tick doesn't immediately - // retry. Setting last_activity_at = now() makes the agent invisible - // to schedulePendingWork for DISPATCH_COOLDOWN_MS (2 min). - query( - this.sql, - /* sql */ ` - UPDATE ${agent_metadata} - SET ${agent_metadata.columns.last_activity_at} = ? - WHERE ${agent_metadata.bead_id} = ? - `, - [now(), triageAgent.id] - ); console.error(`${TOWN_LOG} maybeDispatchTriageAgent: triage agent failed to start`); } } diff --git a/cloudflare-gastown/src/dos/town/patrol.ts b/cloudflare-gastown/src/dos/town/patrol.ts index e8cc2de374..b7e6298a4a 100644 --- a/cloudflare-gastown/src/dos/town/patrol.ts +++ b/cloudflare-gastown/src/dos/town/patrol.ts @@ -107,26 +107,31 @@ export function createTriageRequest( if (existing.length > 0) return; } - // Global cap: skip if there are already too many open triage requests. - // Prevents unbounded accumulation during feedback loops. - const openCountRows = [ - ...query( - sql, - /* sql */ ` - SELECT COUNT(*) AS cnt FROM ${beads} - WHERE ${beads.type} = 'issue' - AND ${beads.labels} LIKE ? - AND ${beads.status} = 'open' - `, - [TRIAGE_LABEL_LIKE] - ), - ]; - const openCount = Number(z.object({ cnt: z.number() }).parse(openCountRows[0]).cnt); - if (openCount >= MAX_OPEN_TRIAGE_REQUESTS) { - console.warn( - `${LOG} createTriageRequest: global cap reached (${openCount} open), skipping type=${params.triageType}` - ); - return; + // Global cap: skip if there are already too many open *automatic* triage + // requests (patrol-generated). Escalations are exempt from both the gate + // and the count — they are agent/user initiated and silently dropping + // them would leave the escalation bead with no automated follow-up. + if (params.triageType !== 'escalation') { + const openCountRows = [ + ...query( + sql, + /* sql */ ` + SELECT COUNT(*) AS cnt FROM ${beads} + WHERE ${beads.type} = 'issue' + AND ${beads.labels} LIKE ? + AND ${beads.status} = 'open' + AND json_extract(${beads.metadata}, '$.triage_type') != 'escalation' + `, + [TRIAGE_LABEL_LIKE] + ), + ]; + const openCount = Number(z.object({ cnt: z.number() }).parse(openCountRows[0]).cnt); + if (openCount >= MAX_OPEN_TRIAGE_REQUESTS) { + console.warn( + `${LOG} createTriageRequest: global cap reached (${openCount} open), skipping type=${params.triageType}` + ); + return; + } } const metadata: TriageRequestMetadata = { @@ -585,8 +590,12 @@ export function detectCrashLoops(sql: SqlStorage): void { // Exclude triage agents from crash loop detection — their failures must // not create new triage requests, which would feed the feedback loop. - // An agent is considered a triage agent if its current hooked bead has - // the gt:triage or gt:triage-request label (both start with "gt:triage"). + // Two complementary checks: + // 1. The failed bead itself carries a triage label (covers triage batch + // bead failures, stable after unhook clears current_hook_bead_id). + // 2. The agent is currently hooked to a triage-labeled bead (covers + // resolveTriage actions like CLOSE_BEAD that fail ordinary beads + // while the triage agent is still working its batch). const TRIAGE_LABEL_ANY = `%"gt:triage%`; const rows = CrashRow.array().parse([ @@ -599,6 +608,11 @@ export function detectCrashLoops(sql: SqlStorage): void { AND be.new_value = 'failed' AND be.agent_id IS NOT NULL AND be.created_at > ? + AND NOT EXISTS ( + SELECT 1 FROM ${beads} AS failed_bead + WHERE failed_bead.${beads.columns.bead_id} = be.bead_id + AND failed_bead.${beads.columns.labels} LIKE ? + ) AND NOT EXISTS ( SELECT 1 FROM ${agent_metadata} INNER JOIN ${beads} AS hooked @@ -609,7 +623,7 @@ export function detectCrashLoops(sql: SqlStorage): void { GROUP BY be.agent_id HAVING fail_count >= ? `, - [windowCutoff, TRIAGE_LABEL_ANY, CRASH_LOOP_THRESHOLD] + [windowCutoff, TRIAGE_LABEL_ANY, TRIAGE_LABEL_ANY, CRASH_LOOP_THRESHOLD] ), ]); diff --git a/cloudflare-gastown/src/types.ts b/cloudflare-gastown/src/types.ts index 5cd6c6d6fb..a775283462 100644 --- a/cloudflare-gastown/src/types.ts +++ b/cloudflare-gastown/src/types.ts @@ -48,7 +48,7 @@ export type BeadFilter = { // -- Agents (now beads + agent_metadata) -- -export const AgentRole = z.enum(['polecat', 'refinery', 'mayor', 'triage']); +export const AgentRole = z.enum(['polecat', 'refinery', 'mayor']); export type AgentRole = z.infer; export const AgentStatus = z.enum(['idle', 'working', 'stalled', 'dead']);