From 6b0a7a9eecc7dd89bb5cfdc03b5c9097292c98f5 Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Wed, 18 Mar 2026 17:19:25 -0500 Subject: [PATCH 01/47] fix(gastown): route MR bead failures through full review lifecycle to unblock convoys Replace direct completeReview() calls in processReviewQueue() failure paths with completeReviewWithResult(), which properly updates convoy progress and returns source beads to in_progress for rework dispatch. Also adds recovery for orphaned source beads stuck in in_review and fixes bead status rollback consistency in dispatchAgent. --- cloudflare-gastown/src/dos/Town.do.ts | 70 +++++- .../src/dos/town/review-queue.ts | 48 ++++ .../test/integration/review-failure.test.ts | 222 ++++++++++++++++++ 3 files changed, 336 insertions(+), 4 deletions(-) create mode 100644 cloudflare-gastown/test/integration/review-failure.test.ts diff --git a/cloudflare-gastown/src/dos/Town.do.ts b/cloudflare-gastown/src/dos/Town.do.ts index d73cfdded9..b696eec557 100644 --- a/cloudflare-gastown/src/dos/Town.do.ts +++ b/cloudflare-gastown/src/dos/Town.do.ts @@ -3040,7 +3040,8 @@ export class TownDO extends DurableObject { role: agent.role, }); } else { - // Container failed to start — roll back to idle + // Container failed to start — roll back agent to idle and bead + // to open so schedulePendingWork can retry on the next alarm. 
query( this.sql, /* sql */ ` @@ -3050,6 +3051,9 @@ export class TownDO extends DurableObject { `, [agent.id] ); + if (agent.current_hook_bead_id) { + beadOps.updateBeadStatus(this.sql, agent.current_hook_bead_id, 'open', agent.id); + } this.emitEvent({ event: 'agent.dispatch_failed', townId: this.townId, @@ -3504,6 +3508,7 @@ export class TownDO extends DurableObject { private async processReviewQueue(): Promise { reviewQueue.recoverStuckReviews(this.sql); reviewQueue.closeOrphanedReviewBeads(this.sql); + reviewQueue.recoverOrphanedSourceBeads(this.sql); // Poll open PRs created by the 'pr' strategy await this.pollPendingPRs(); @@ -3516,12 +3521,15 @@ export class TownDO extends DurableObject { const rigId = entry.rig_id; if (!rigId) { console.error(`${TOWN_LOG} processReviewQueue: entry ${entry.id} has no rig_id, skipping`); - reviewQueue.completeReview(this.sql, entry.id, 'failed'); + this.failReviewWithRework(entry, 'MR bead has no rig_id'); return; } const rigConfig = await this.getRigConfig(rigId); if (!rigConfig) { - reviewQueue.completeReview(this.sql, entry.id, 'failed'); + console.error( + `${TOWN_LOG} processReviewQueue: no rig config for rig=${rigId}, entry=${entry.id}` + ); + this.failReviewWithRework(entry, `No rig config found for rig ${rigId}`); return; } @@ -3638,7 +3646,61 @@ export class TownDO extends DurableObject { console.error( `${TOWN_LOG} processReviewQueue: refinery agent failed to start for entry=${entry.id}` ); - reviewQueue.completeReview(this.sql, entry.id, 'failed'); + this.failReviewWithRework(entry, 'Refinery container failed to start'); + } + } + + /** + * Fail an MR bead via the full review lifecycle (completeReviewWithResult) + * so that convoy progress is updated and the source bead is returned to + * in_progress for rework. Mirrors the rework dispatch in + * completeReviewWithResult and agentCompleted. 
+ * + * Used by processReviewQueue failure paths that previously called + * completeReview directly — which bypassed convoy progress and left the + * source bead stuck in in_review. + */ + private failReviewWithRework(entry: ReviewQueueEntry, reason: string): void { + reviewQueue.completeReviewWithResult(this.sql, { + entry_id: entry.id, + status: 'failed', + message: reason, + }); + + this.emitEvent({ + event: 'review.failed', + townId: this.townId, + beadId: entry.id, + }); + + // The source bead was returned to in_progress by completeReviewWithResult. + // Attempt to dispatch a polecat for rework (same pattern as the public + // completeReviewWithResult method). + const sourceBeadId = entry.bead_id; + if (sourceBeadId && sourceBeadId !== entry.id) { + const sourceBead = beadOps.getBead(this.sql, sourceBeadId); + if (sourceBead?.rig_id) { + try { + const reworkAgent = agents.getOrCreateAgent( + this.sql, + 'polecat', + sourceBead.rig_id, + this.townId + ); + agents.hookBead(this.sql, reworkAgent.id, sourceBeadId); + this.dispatchAgent(reworkAgent, sourceBead).catch(err => + console.error( + `${TOWN_LOG} failReviewWithRework: rework dispatch failed for bead=${sourceBeadId}`, + err + ) + ); + } catch (err) { + console.warn( + `${TOWN_LOG} failReviewWithRework: could not dispatch rework for bead=${sourceBeadId}:`, + err + ); + } + } } } diff --git a/cloudflare-gastown/src/dos/town/review-queue.ts b/cloudflare-gastown/src/dos/town/review-queue.ts index 26df5a6ddf..d56a0bbb20 100644 --- a/cloudflare-gastown/src/dos/town/review-queue.ts +++ b/cloudflare-gastown/src/dos/town/review-queue.ts @@ -472,6 +472,54 @@ export function closeOrphanedReviewBeads(sql: SqlStorage): void { } } +/** + * Recover source beads stuck in 'in_review' whose MR bead has already + * reached a terminal state (closed/failed). This can happen when an MR + * bead failure path bypasses completeReviewWithResult (which is the only + * path that returns the source bead to in_progress for rework). 
+ * + * Only affects source beads that have been stuck for longer than the + * recovery timeout, to avoid interfering with in-flight reviews. + */ +export function recoverOrphanedSourceBeads(sql: SqlStorage): void { + const cutoff = new Date(Date.now() - REVIEW_RUNNING_TIMEOUT_MS).toISOString(); + + const stuckRows = [ + ...query( + sql, + /* sql */ ` + SELECT src.${beads.columns.bead_id} AS source_bead_id + FROM ${beads} AS src + INNER JOIN ${bead_dependencies} AS dep + ON dep.${bead_dependencies.columns.depends_on_bead_id} = src.${beads.columns.bead_id} + AND dep.${bead_dependencies.columns.dependency_type} = 'tracks' + INNER JOIN ${beads} AS mr + ON mr.${beads.columns.bead_id} = dep.${bead_dependencies.columns.bead_id} + AND mr.${beads.columns.type} = 'merge_request' + WHERE src.${beads.columns.status} = 'in_review' + AND src.${beads.columns.updated_at} < ? + AND mr.${beads.columns.status} IN ('closed', 'failed') + `, + [cutoff] + ), + ]; + + for (const row of stuckRows) { + const parsed = z.object({ source_bead_id: z.string() }).parse(row); + try { + updateBeadStatus(sql, parsed.source_bead_id, 'in_progress', 'system'); + console.log( + `[review-queue] recoverOrphanedSourceBeads: returned bead=${parsed.source_bead_id} to in_progress` + ); + } catch (err) { + console.warn( + `[review-queue] recoverOrphanedSourceBeads: failed to recover bead=${parsed.source_bead_id}`, + err + ); + } + } +} + // ── Agent Done ────────────────────────────────────────────────────── export function agentDone(sql: SqlStorage, agentId: string, input: AgentDoneInput): void { diff --git a/cloudflare-gastown/test/integration/review-failure.test.ts b/cloudflare-gastown/test/integration/review-failure.test.ts new file mode 100644 index 0000000000..1c17b6ec43 --- /dev/null +++ b/cloudflare-gastown/test/integration/review-failure.test.ts @@ -0,0 +1,222 @@ +import { env } from 'cloudflare:test'; +import { describe, it, expect, beforeEach } from 'vitest'; + +function getTownStub(name = 
'test-town') { + const id = env.TOWN.idFromName(name); + return env.TOWN.get(id); +} + +describe('Review failure paths — convoy progress and source bead recovery', () => { + let town: ReturnType; + + beforeEach(() => { + town = getTownStub(`review-failure-${crypto.randomUUID()}`); + }); + + async function setupConvoyWithMR() { + await town.addRig({ + rigId: 'rig-1', + name: 'main-rig', + gitUrl: 'https://github.com/test/repo.git', + defaultBranch: 'main', + }); + + const result = await town.slingConvoy({ + rigId: 'rig-1', + convoyTitle: 'Review Failure Test', + tasks: [{ title: 'Task 1' }], + }); + + const beadId = result.beads[0].bead.bead_id; + const agentId = result.beads[0].agent.id; + + // Simulate agent completing work — creates an MR bead in review queue + await town.agentDone(agentId, { + branch: 'gt/polecat/test-branch', + summary: 'Completed task', + }); + + // Source bead should now be in_review (waiting for refinery) + const sourceBead = await town.getBeadAsync(beadId); + expect(sourceBead?.status).toBe('in_review'); + + // Find the MR bead + const allBeads = await town.listBeads({ type: 'merge_request' }); + const mrBead = allBeads.find(b => b.metadata?.source_bead_id === beadId); + expect(mrBead).toBeTruthy(); + + return { result, beadId, agentId, mrBeadId: mrBead!.bead_id, convoyId: result.convoy.id }; + } + + // ── completeReviewWithResult properly updates convoy progress ─────── + + describe('completeReviewWithResult on MR failure', () => { + it('should return source bead to in_progress when MR bead fails', async () => { + const { beadId, mrBeadId } = await setupConvoyWithMR(); + + // Fail the review via completeReviewWithResult (the fixed path) + await town.completeReviewWithResult({ + entry_id: mrBeadId, + status: 'failed', + message: 'Refinery container failed to start', + }); + + // MR bead should be failed + const mrBead = await town.getBeadAsync(mrBeadId); + expect(mrBead?.status).toBe('failed'); + + // Source bead should be returned to 
in_progress (not stuck in in_review) + const sourceBead = await town.getBeadAsync(beadId); + expect(sourceBead?.status).toBe('in_progress'); + }); + + it('should update convoy progress when MR bead is merged', async () => { + const { beadId, mrBeadId, convoyId } = await setupConvoyWithMR(); + + // Complete the review successfully + await town.completeReviewWithResult({ + entry_id: mrBeadId, + status: 'merged', + message: 'Merged by refinery', + }); + + // Source bead should be closed + const sourceBead = await town.getBeadAsync(beadId); + expect(sourceBead?.status).toBe('closed'); + + // MR bead should be closed + const mrBead = await town.getBeadAsync(mrBeadId); + expect(mrBead?.status).toBe('closed'); + + // Convoy progress should reflect the closed bead + const convoyStatus = await town.getConvoyStatus(convoyId); + expect(convoyStatus?.closed_beads).toBe(1); + }); + }); + + // ── Multi-bead convoy: failed MR doesn't stall the convoy ────────── + + describe('convoy progress with mixed outcomes', () => { + it('should not stall convoy when one MR fails and another merges', async () => { + await town.addRig({ + rigId: 'rig-1', + name: 'main-rig', + gitUrl: 'https://github.com/test/repo.git', + defaultBranch: 'main', + }); + + const result = await town.slingConvoy({ + rigId: 'rig-1', + convoyTitle: 'Two-Task Convoy', + tasks: [{ title: 'Task 1' }, { title: 'Task 2' }], + }); + + const bead0Id = result.beads[0].bead.bead_id; + const agent0Id = result.beads[0].agent.id; + const bead1Id = result.beads[1].bead.bead_id; + const agent1Id = result.beads[1].agent.id; + + // Both agents complete work + await town.agentDone(agent0Id, { + branch: 'gt/polecat/task-1', + summary: 'Task 1 done', + }); + await town.agentDone(agent1Id, { + branch: 'gt/polecat/task-2', + summary: 'Task 2 done', + }); + + // Find MR beads + const mrBeads = await town.listBeads({ type: 'merge_request' }); + const mr0 = mrBeads.find(b => b.metadata?.source_bead_id === bead0Id); + const mr1 = 
mrBeads.find(b => b.metadata?.source_bead_id === bead1Id); + expect(mr0).toBeTruthy(); + expect(mr1).toBeTruthy(); + + // Fail MR for task 1 via completeReviewWithResult + await town.completeReviewWithResult({ + entry_id: mr0!.bead_id, + status: 'failed', + message: 'Review failed', + }); + + // Source bead 0 should be back to in_progress (ready for rework) + const source0 = await town.getBeadAsync(bead0Id); + expect(source0?.status).toBe('in_progress'); + + // Merge MR for task 2 + await town.completeReviewWithResult({ + entry_id: mr1!.bead_id, + status: 'merged', + message: 'Merged', + }); + + // Source bead 1 should be closed + const source1 = await town.getBeadAsync(bead1Id); + expect(source1?.status).toBe('closed'); + + // Convoy should show 1 closed bead (task 2 merged; task 1 is in_progress + // awaiting rework, its MR is failed but the source isn't terminal yet) + const convoyStatus = await town.getConvoyStatus(result.convoy.id); + expect(convoyStatus?.closed_beads).toBe(1); + }); + }); + + // ── Direct completeReview leaves source bead orphaned (regression) ─ + + describe('completeReview bypass (regression guard)', () => { + it('should leave source bead stuck in in_review when completeReview is called directly', async () => { + const { beadId, mrBeadId } = await setupConvoyWithMR(); + + // Call completeReview directly (the OLD broken path) — + // this is the raw SQL update that bypasses lifecycle events. + // We use this to verify the regression scenario. + await town.completeReview(mrBeadId, 'failed'); + + // MR bead should be failed + const mrBead = await town.getBeadAsync(mrBeadId); + expect(mrBead?.status).toBe('failed'); + + // Source bead is STILL in_review — this is the bug this PR fixes + // in processReviewQueue. The direct completeReview call doesn't + // return the source bead to in_progress. 
+ const sourceBead = await town.getBeadAsync(beadId); + expect(sourceBead?.status).toBe('in_review'); + }); + }); + + // ── Source bead in_review after agentDone ────────────────────────── + + describe('agentDone transitions source bead to in_review', () => { + it('should set source bead to in_review after polecat calls agentDone', async () => { + await town.addRig({ + rigId: 'rig-1', + name: 'main-rig', + gitUrl: 'https://github.com/test/repo.git', + defaultBranch: 'main', + }); + + const result = await town.slingConvoy({ + rigId: 'rig-1', + convoyTitle: 'Agent Done Test', + tasks: [{ title: 'Single Task' }], + }); + + const beadId = result.beads[0].bead.bead_id; + const agentId = result.beads[0].agent.id; + + await town.agentDone(agentId, { + branch: 'gt/polecat/test', + summary: 'Done', + }); + + const bead = await town.getBeadAsync(beadId); + expect(bead?.status).toBe('in_review'); + + // An MR bead should have been created + const mrBeads = await town.listBeads({ type: 'merge_request' }); + expect(mrBeads.length).toBeGreaterThan(0); + expect(mrBeads.some(b => b.metadata?.source_bead_id === beadId)).toBe(true); + }); + }); +}); From e332f0dc2ba45db28e0f8afac0425c15e392d3ad Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Wed, 18 Mar 2026 17:28:31 -0500 Subject: [PATCH 02/47] =?UTF-8?q?fix:=20address=20PR=20review=20=E2=80=94?= =?UTF-8?q?=20exclude=20pending=20MRs=20from=20orphan=20recovery,=20use=20?= =?UTF-8?q?open=20status?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add NOT EXISTS subquery to recoverOrphanedSourceBeads to skip source beads that still have a pending (open/in_progress) MR child, preventing false recovery during rework cycles. - Change recovery target status from in_progress to open so the scheduler can assign and dispatch a fresh polecat (the original agent was already unhooked by agentDone). 
--- .../src/dos/town/review-queue.ts | 26 ++++++++++++++----- 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/cloudflare-gastown/src/dos/town/review-queue.ts b/cloudflare-gastown/src/dos/town/review-queue.ts index d56a0bbb20..8150a21185 100644 --- a/cloudflare-gastown/src/dos/town/review-queue.ts +++ b/cloudflare-gastown/src/dos/town/review-queue.ts @@ -473,10 +473,15 @@ export function closeOrphanedReviewBeads(sql: SqlStorage): void { } /** - * Recover source beads stuck in 'in_review' whose MR bead has already - * reached a terminal state (closed/failed). This can happen when an MR - * bead failure path bypasses completeReviewWithResult (which is the only - * path that returns the source bead to in_progress for rework). + * Recover source beads stuck in 'in_review' whose MR beads have all + * reached a terminal state (closed/failed) with no pending review still + * in flight. This can happen when an MR bead failure path bypasses + * completeReviewWithResult (which is the only path that returns the + * source bead to in_progress for rework). + * + * Returns beads to 'open' (not 'in_progress') so the scheduler can + * assign and dispatch a fresh polecat — by this point the original + * agent has already been unhooked. * * Only affects source beads that have been stuck for longer than the * recovery timeout, to avoid interfering with in-flight reviews. @@ -499,6 +504,15 @@ export function recoverOrphanedSourceBeads(sql: SqlStorage): void { WHERE src.${beads.columns.status} = 'in_review' AND src.${beads.columns.updated_at} < ? 
AND mr.${beads.columns.status} IN ('closed', 'failed') + AND NOT EXISTS ( + SELECT 1 FROM ${bead_dependencies} AS dep2 + INNER JOIN ${beads} AS mr2 + ON mr2.${beads.columns.bead_id} = dep2.${bead_dependencies.columns.bead_id} + WHERE dep2.${bead_dependencies.columns.depends_on_bead_id} = src.${beads.columns.bead_id} + AND dep2.${bead_dependencies.columns.dependency_type} = 'tracks' + AND mr2.${beads.columns.type} = 'merge_request' + AND mr2.${beads.columns.status} IN ('open', 'in_progress') + ) `, [cutoff] ), @@ -507,9 +521,9 @@ export function recoverOrphanedSourceBeads(sql: SqlStorage): void { for (const row of stuckRows) { const parsed = z.object({ source_bead_id: z.string() }).parse(row); try { - updateBeadStatus(sql, parsed.source_bead_id, 'in_progress', 'system'); + updateBeadStatus(sql, parsed.source_bead_id, 'open', 'system'); console.log( - `[review-queue] recoverOrphanedSourceBeads: returned bead=${parsed.source_bead_id} to in_progress` + `[review-queue] recoverOrphanedSourceBeads: returned bead=${parsed.source_bead_id} to open` ); } catch (err) { console.warn( From 2a1278b6e8d1b3b8ec250d0fefbedbfaacf79446 Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Wed, 18 Mar 2026 20:24:06 -0500 Subject: [PATCH 03/47] refactor(gastown): remove superfluous ensureInitialized calls from Town DO blockConcurrencyWhile in the constructor already guarantees initialization completes before any RPC method executes. The 64 redundant await this.ensureInitialized() calls in public methods were all no-ops (just checking an already-resolved promise). 
--- cloudflare-gastown/src/dos/Town.do.ts | 77 --------------------------- cloudflare-gastown/wrangler.jsonc | 16 +++++- 2 files changed, 15 insertions(+), 78 deletions(-) diff --git a/cloudflare-gastown/src/dos/Town.do.ts b/cloudflare-gastown/src/dos/Town.do.ts index b696eec557..c9a2521196 100644 --- a/cloudflare-gastown/src/dos/Town.do.ts +++ b/cloudflare-gastown/src/dos/Town.do.ts @@ -252,7 +252,6 @@ export class TownDO extends DurableObject { url.pathname.endsWith('/status/ws') && request.headers.get('Upgrade')?.toLowerCase() === 'websocket' ) { - await this.ensureInitialized(); const pair = new WebSocketPair(); const [client, server] = [pair[0], pair[1]]; this.ctx.acceptWebSocket(server, ['status']); @@ -393,7 +392,6 @@ export class TownDO extends DurableObject { * Called by the mayor via the /mayor/ui-action HTTP route. */ async broadcastUiAction(action: UiAction): Promise { - await this.ensureInitialized(); const sockets = this.ctx.getWebSockets('status'); if (sockets.length === 0) return; const frame = JSON.stringify({ channel: 'ui_action', action, ts: now() }); @@ -557,12 +555,10 @@ export class TownDO extends DurableObject { gitUrl: string; defaultBranch: string; }): Promise { - await this.ensureInitialized(); return rigs.addRig(this.sql, input); } async removeRig(rigId: string): Promise { - await this.ensureInitialized(); rigs.removeRig(this.sql, rigId); await this.ctx.storage.delete(`rig:${rigId}:config`); // Delete all beads belonging to this rig (cascades to satellite tables via deleteBead) @@ -581,12 +577,10 @@ export class TownDO extends DurableObject { } async listRigs(): Promise { - await this.ensureInitialized(); return rigs.listRigs(this.sql); } async getRigAsync(rigId: string): Promise { - await this.ensureInitialized(); return rigs.getRig(this.sql, rigId); } @@ -696,7 +690,6 @@ export class TownDO extends DurableObject { // ══════════════════════════════════════════════════════════════════ async createBead(input: CreateBeadInput): Promise { - 
await this.ensureInitialized(); const bead = beadOps.createBead(this.sql, input); this.emitEvent({ event: 'bead.created', @@ -716,17 +709,14 @@ export class TownDO extends DurableObject { } async getBeadAsync(beadId: string): Promise { - await this.ensureInitialized(); return beadOps.getBead(this.sql, beadId); } async listBeads(filter: BeadFilter): Promise { - await this.ensureInitialized(); return beadOps.listBeads(this.sql, filter); } async updateBeadStatus(beadId: string, status: string, agentId: string): Promise { - await this.ensureInitialized(); // Convoy progress is updated automatically inside beadOps.updateBeadStatus // when the bead reaches a terminal status (closed/failed). const bead = beadOps.updateBeadStatus(this.sql, beadId, status, agentId); @@ -792,7 +782,6 @@ export class TownDO extends DurableObject { } async deleteBead(beadId: string): Promise { - await this.ensureInitialized(); beadOps.deleteBead(this.sql, beadId); } @@ -801,7 +790,6 @@ export class TownDO extends DurableObject { since?: string; limit?: number; }): Promise { - await this.ensureInitialized(); return beadOps.listBeadEvents(this.sql, options); } @@ -822,7 +810,6 @@ export class TownDO extends DurableObject { }>, actorId: string ): Promise { - await this.ensureInitialized(); const bead = beadOps.updateBeadFields(this.sql, beadId, fields, actorId); // When a bead closes via field update, check for newly unblocked beads @@ -839,8 +826,6 @@ export class TownDO extends DurableObject { * Writes a bead_event for auditability. 
*/ async resetAgent(agentId: string): Promise { - await this.ensureInitialized(); - const agent = agents.getAgent(this.sql, agentId); if (!agent) throw new Error(`Agent ${agentId} not found`); @@ -879,8 +864,6 @@ export class TownDO extends DurableObject { convoyId: string, fields: Partial<{ merge_mode: ConvoyMergeMode; feature_branch: string }> ): Promise { - await this.ensureInitialized(); - const convoy = this.getConvoy(convoyId); if (!convoy) return null; @@ -925,32 +908,26 @@ export class TownDO extends DurableObject { // ══════════════════════════════════════════════════════════════════ async registerAgent(input: RegisterAgentInput): Promise { - await this.ensureInitialized(); return agents.registerAgent(this.sql, input); } async getAgentAsync(agentId: string): Promise { - await this.ensureInitialized(); return agents.getAgent(this.sql, agentId); } async getAgentByIdentity(identity: string): Promise { - await this.ensureInitialized(); return agents.getAgentByIdentity(this.sql, identity); } async listAgents(filter?: AgentFilter): Promise { - await this.ensureInitialized(); return agents.listAgents(this.sql, filter); } async updateAgentStatus(agentId: string, status: string): Promise { - await this.ensureInitialized(); agents.updateAgentStatus(this.sql, agentId, status); } async deleteAgent(agentId: string): Promise { - await this.ensureInitialized(); agents.deleteAgent(this.sql, agentId); try { const agentDO = getAgentDOStub(this.env, agentId); @@ -961,23 +938,19 @@ export class TownDO extends DurableObject { } async hookBead(agentId: string, beadId: string): Promise { - await this.ensureInitialized(); agents.hookBead(this.sql, agentId, beadId); await this.armAlarmIfNeeded(); } async unhookBead(agentId: string): Promise { - await this.ensureInitialized(); agents.unhookBead(this.sql, agentId); } async getHookedBead(agentId: string): Promise { - await this.ensureInitialized(); return agents.getHookedBead(this.sql, agentId); } async getOrCreateAgent(role: 
AgentRole, rigId: string): Promise { - await this.ensureInitialized(); return agents.getOrCreateAgent(this.sql, role, rigId, this.townId); } @@ -996,30 +969,25 @@ export class TownDO extends DurableObject { // ── Prime & Checkpoint ──────────────────────────────────────────── async prime(agentId: string): Promise { - await this.ensureInitialized(); return agents.prime(this.sql, agentId); } async writeCheckpoint(agentId: string, data: unknown): Promise { - await this.ensureInitialized(); agents.writeCheckpoint(this.sql, agentId, data); } async readCheckpoint(agentId: string): Promise { - await this.ensureInitialized(); return agents.readCheckpoint(this.sql, agentId); } // ── Heartbeat ───────────────────────────────────────────────────── async touchAgentHeartbeat(agentId: string): Promise { - await this.ensureInitialized(); agents.touchAgent(this.sql, agentId); await this.armAlarmIfNeeded(); } async updateAgentStatusMessage(agentId: string, message: string): Promise { - await this.ensureInitialized(); agents.updateAgentStatusMessage(this.sql, agentId, message); const agent = agents.getAgent(this.sql, agentId); if (agent?.current_hook_bead_id) { @@ -1046,12 +1014,10 @@ export class TownDO extends DurableObject { // ══════════════════════════════════════════════════════════════════ async sendMail(input: SendMailInput): Promise { - await this.ensureInitialized(); mail.sendMail(this.sql, input); } async checkMail(agentId: string): Promise { - await this.ensureInitialized(); return mail.checkMail(this.sql, agentId); } @@ -1074,8 +1040,6 @@ export class TownDO extends DurableObject { ttlSeconds?: number; } ): Promise { - await this.ensureInitialized(); - const nudgeId = crypto.randomUUID(); const mode = options?.mode ?? 'wait-idle'; const priority = options?.priority ?? 
'normal'; @@ -1143,8 +1107,6 @@ export class TownDO extends DurableObject { ): Promise< { nudge_id: string; message: string; mode: string; priority: string; source: string }[] > { - await this.ensureInitialized(); - const rows = [ ...query( this.sql, @@ -1180,8 +1142,6 @@ export class TownDO extends DurableObject { /** Mark a nudge as delivered. */ async markNudgeDelivered(nudgeId: string): Promise { - await this.ensureInitialized(); - query( this.sql, /* sql */ ` @@ -1198,8 +1158,6 @@ export class TownDO extends DurableObject { * Called from the alarm loop. Returns the count of nudges expired. */ async expireStaleNudges(): Promise { - await this.ensureInitialized(); - const result = [ ...query( this.sql, @@ -1223,7 +1181,6 @@ export class TownDO extends DurableObject { // ══════════════════════════════════════════════════════════════════ async submitToReviewQueue(input: ReviewQueueInput): Promise { - await this.ensureInitialized(); reviewQueue.submitToReviewQueue(this.sql, input); this.emitEvent({ event: 'review.submitted', @@ -1235,12 +1192,10 @@ export class TownDO extends DurableObject { } async popReviewQueue(): Promise { - await this.ensureInitialized(); return reviewQueue.popReviewQueue(this.sql); } async completeReview(entryId: string, status: 'merged' | 'failed'): Promise { - await this.ensureInitialized(); reviewQueue.completeReview(this.sql, entryId, status); } @@ -1250,8 +1205,6 @@ export class TownDO extends DurableObject { message?: string; commit_sha?: string; }): Promise { - await this.ensureInitialized(); - // Resolve the source bead ID before completing the review, so we can // trigger dispatchUnblockedBeads for it after the MR closes. 
const mrBead = beadOps.getBead(this.sql, input.entry_id); @@ -1312,7 +1265,6 @@ export class TownDO extends DurableObject { } async agentDone(agentId: string, input: AgentDoneInput): Promise { - await this.ensureInitialized(); reviewQueue.agentDone(this.sql, agentId, input); await this.armAlarmIfNeeded(); } @@ -1321,7 +1273,6 @@ export class TownDO extends DurableObject { agentId: string, input: { status: 'completed' | 'failed'; reason?: string } ): Promise { - await this.ensureInitialized(); let resolvedAgentId = agentId; if (!resolvedAgentId) { const mayor = agents.listAgents(this.sql, { role: 'mayor' })[0]; @@ -1379,7 +1330,6 @@ export class TownDO extends DurableObject { action: string; resolution_notes: string; }): Promise { - await this.ensureInitialized(); const triageBead = beadOps.getBead(this.sql, input.triage_request_bead_id); if (!triageBead) throw new Error(`Triage request bead ${input.triage_request_bead_id} not found`); @@ -1605,19 +1555,16 @@ export class TownDO extends DurableObject { } async createMolecule(beadId: string, formula: unknown): Promise { - await this.ensureInitialized(); return reviewQueue.createMolecule(this.sql, beadId, formula); } async getMoleculeCurrentStep( agentId: string ): Promise<{ molecule: Molecule; step: unknown } | null> { - await this.ensureInitialized(); return reviewQueue.getMoleculeCurrentStep(this.sql, agentId); } async advanceMoleculeStep(agentId: string, summary: string): Promise { - await this.ensureInitialized(); return reviewQueue.advanceMoleculeStep(this.sql, agentId, summary); } @@ -1632,8 +1579,6 @@ export class TownDO extends DurableObject { priority?: string; metadata?: Record; }): Promise<{ bead: Bead; agent: Agent }> { - await this.ensureInitialized(); - const createdBead = beadOps.createBead(this.sql, { type: 'issue', title: input.title, @@ -1686,7 +1631,6 @@ export class TownDO extends DurableObject { _model?: string, uiContext?: string ): Promise<{ agentId: string; sessionStatus: 'idle' | 'active' | 
'starting' }> { - await this.ensureInitialized(); const townId = this.townId; let mayor = agents.listAgents(this.sql, { role: 'mayor' })[0] ?? null; @@ -1771,7 +1715,6 @@ export class TownDO extends DurableObject { * without requiring the user to send a message first. */ async ensureMayor(): Promise<{ agentId: string; sessionStatus: 'idle' | 'active' | 'starting' }> { - await this.ensureInitialized(); const townId = this.townId; let mayor = agents.listAgents(this.sql, { role: 'mayor' })[0] ?? null; @@ -1855,7 +1798,6 @@ export class TownDO extends DurableObject { lastActivityAt: string; } | null; }> { - await this.ensureInitialized(); const mayor = agents.listAgents(this.sql, { role: 'mayor' })[0] ?? null; const mapStatus = (agentStatus: string): 'idle' | 'active' | 'starting' => { @@ -1914,7 +1856,6 @@ export class TownDO extends DurableObject { beads: Array<{ bead_id: string; rig_id: string }>; created_by?: string; }): Promise { - await this.ensureInitialized(); const parsed = z .object({ title: z.string().min(1), @@ -1996,8 +1937,6 @@ export class TownDO extends DurableObject { } async onBeadClosed(input: { convoyId: string; beadId: string }): Promise { - await this.ensureInitialized(); - // Count closed tracked beads const closedRows = [ ...query( @@ -2063,8 +2002,6 @@ export class TownDO extends DurableObject { * still assigned to those beads so they return to the idle pool. */ async closeConvoy(convoyId: string): Promise { - await this.ensureInitialized(); - const convoy = this.getConvoy(convoyId); if (!convoy) return null; @@ -2152,8 +2089,6 @@ export class TownDO extends DurableObject { merge_mode?: 'review-then-land' | 'review-and-merge'; staged?: boolean; }): Promise<{ convoy: ConvoyEntry; beads: Array<{ bead: Bead; agent: Agent | null }> }> { - await this.ensureInitialized(); - // Resolve staged: explicit request wins, otherwise fall back to town config default. const townConfig = await this.getTownConfig(); const isStaged = input.staged ?? 
townConfig.staged_convoys_default; @@ -2362,8 +2297,6 @@ export class TownDO extends DurableObject { async startConvoy( convoyId: string ): Promise<{ convoy: ConvoyEntry; beads: Array<{ bead: Bead; agent: Agent }> }> { - await this.ensureInitialized(); - const convoy = this.getConvoy(convoyId); if (!convoy) throw new Error(`Convoy not found: ${convoyId}`); if (!convoy.staged) throw new Error(`Convoy is not staged: ${convoyId}`); @@ -2458,7 +2391,6 @@ export class TownDO extends DurableObject { * List active convoys with progress counts. */ async listConvoys(): Promise { - await this.ensureInitialized(); const rows = [ ...query( this.sql, @@ -2492,7 +2424,6 @@ export class TownDO extends DurableObject { } > > { - await this.ensureInitialized(); const convoys = await this.listConvoys(); const detailed = []; for (const convoy of convoys) { @@ -2521,7 +2452,6 @@ export class TownDO extends DurableObject { }) | null > { - await this.ensureInitialized(); const convoy = this.getConvoy(convoyId); if (!convoy) return null; @@ -2579,7 +2509,6 @@ export class TownDO extends DurableObject { // ══════════════════════════════════════════════════════════════════ async acknowledgeEscalation(escalationId: string): Promise { - await this.ensureInitialized(); query( this.sql, /* sql */ ` @@ -2606,7 +2535,6 @@ export class TownDO extends DurableObject { } async listEscalations(filter?: { acknowledged?: boolean }): Promise { - await this.ensureInitialized(); const rows = filter?.acknowledged !== undefined ? 
[ @@ -2634,7 +2562,6 @@ export class TownDO extends DurableObject { category?: string; message: string; }): Promise { - await this.ensureInitialized(); const beadId = generateId(); const timestamp = now(); @@ -2787,7 +2714,6 @@ export class TownDO extends DurableObject { return; } - await this.ensureInitialized(); const townId = this.townId; console.log(`${TOWN_LOG} alarm: fired for town=${townId}`); @@ -4124,7 +4050,6 @@ export class TownDO extends DurableObject { activeAgents: number; pendingBeads: number; }> { - await this.ensureInitialized(); const townId = this.townId; // Check if alarm is set @@ -4196,8 +4121,6 @@ export class TownDO extends DurableObject { message: string; }>; }> { - await this.ensureInitialized(); - const currentAlarm = await this.ctx.storage.getAlarm(); const active = this.hasActiveWork(); const intervalMs = active ? ACTIVE_ALARM_INTERVAL_MS : IDLE_ALARM_INTERVAL_MS; diff --git a/cloudflare-gastown/wrangler.jsonc b/cloudflare-gastown/wrangler.jsonc index 5644cd45cd..683e4e58d6 100644 --- a/cloudflare-gastown/wrangler.jsonc +++ b/cloudflare-gastown/wrangler.jsonc @@ -5,7 +5,21 @@ "compatibility_date": "2026-02-24", "compatibility_flags": ["nodejs_compat"], "placement": { "mode": "smart" }, - "observability": { "enabled": true }, + "observability": { + "enabled": true, + "head_sampling_rate": 1, + "logs": { + "enabled": true, + "head_sampling_rate": 1, + "persist": true, + "invocation_logs": true, + }, + "traces": { + "enabled": true, + "persist": true, + "head_sampling_rate": 1, + }, + }, "upload_source_maps": true, "version_metadata": { "binding": "CF_VERSION_METADATA" }, "routes": [ From 7258e16325adf997e0d0505d3192dd16c702cded Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Wed, 18 Mar 2026 20:32:01 -0500 Subject: [PATCH 04/47] refactor(gastown): restrict setTownId to town creation paths only MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The town ID is persisted to DO storage on creation and 
loaded during initializeDatabase(). Every subsequent RPC call was redundantly calling setTownId, writing the same value to storage on every request. Removed 21 redundant calls from router.ts, mayor.handler.ts, org-towns.handler.ts, and configureRig — keeping only the 3 creation-time calls. Also removed a misleading comment in alarm() claiming processReviewQueue must run before schedulePendingWork. The ordering is irrelevant: the refinery and polecats operate on different bead types, and the DAG blocker check at dispatch time prevents premature downstream dispatch regardless of scheduling order. --- cloudflare-gastown/src/dos/Town.do.ts | 6 ------ cloudflare-gastown/src/handlers/mayor.handler.ts | 7 ------- cloudflare-gastown/src/handlers/org-towns.handler.ts | 1 - cloudflare-gastown/src/trpc/router.ts | 10 ---------- 4 files changed, 24 deletions(-) diff --git a/cloudflare-gastown/src/dos/Town.do.ts b/cloudflare-gastown/src/dos/Town.do.ts index c9a2521196..06d5b8851f 100644 --- a/cloudflare-gastown/src/dos/Town.do.ts +++ b/cloudflare-gastown/src/dos/Town.do.ts @@ -590,9 +590,6 @@ export class TownDO extends DurableObject { console.log( `${TOWN_LOG} configureRig: rigId=${rigConfig.rigId} hasKilocodeToken=${!!rigConfig.kilocodeToken}` ); - if (rigConfig.townId) { - await this.setTownId(rigConfig.townId); - } await this.ctx.storage.put(`rig:${rigConfig.rigId}:config`, rigConfig); if (rigConfig.kilocodeToken) { @@ -2738,9 +2735,6 @@ export class TownDO extends DurableObject { } } - // Process reviews FIRST so the refinery gets assigned before the - // scheduler dispatches new polecats. This prevents downstream beads - // from starting before upstream reviews are merged. 
try { await this.processReviewQueue(); } catch (err) { diff --git a/cloudflare-gastown/src/handlers/mayor.handler.ts b/cloudflare-gastown/src/handlers/mayor.handler.ts index 0d6740a0f7..cbe0470450 100644 --- a/cloudflare-gastown/src/handlers/mayor.handler.ts +++ b/cloudflare-gastown/src/handlers/mayor.handler.ts @@ -50,9 +50,6 @@ export async function handleSendMayorMessage(c: Context, params: { t ); const town = getTownDOStub(c.env, params.townId); - // Ensure the TownDO knows its real UUID (ctx.id.name is unreliable in local dev) - // TODO: This should only be done on town creation. Why are we doing it here? - await town.setTownId(params.townId); const result = await town.sendMayorMessage( parsed.data.message, parsed.data.model, @@ -67,7 +64,6 @@ export async function handleSendMayorMessage(c: Context, params: { t */ export async function handleGetMayorStatus(c: Context, params: { townId: string }) { const town = getTownDOStub(c.env, params.townId); - await town.setTownId(params.townId); const status = await town.getMayorStatus(); return c.json(resSuccess(status), 200); } @@ -80,7 +76,6 @@ export async function handleGetMayorStatus(c: Context, params: { tow export async function handleEnsureMayor(c: Context, params: { townId: string }) { console.log(`${MAYOR_HANDLER_LOG} handleEnsureMayor: townId=${params.townId}`); const town = getTownDOStub(c.env, params.townId); - await town.setTownId(params.townId); const result = await town.ensureMayor(); return c.json(resSuccess(result), 200); } @@ -156,7 +151,6 @@ export async function handleSetDashboardContext( } const town = getTownDOStub(c.env, params.townId); - await town.setTownId(params.townId); await town.setDashboardContext(parsed.data.context); return c.json(resSuccess({ stored: true }), 200); } @@ -184,7 +178,6 @@ export async function handleBroadcastUiAction(c: Context, params: { const action = normalizeUiAction(parsed.data.action, params.townId); const town = getTownDOStub(c.env, params.townId); - await 
town.setTownId(params.townId); // Validate that the referenced rig belongs to this town const rigId = uiActionRigId(action); diff --git a/cloudflare-gastown/src/handlers/org-towns.handler.ts b/cloudflare-gastown/src/handlers/org-towns.handler.ts index 478959671b..a875e8375d 100644 --- a/cloudflare-gastown/src/handlers/org-towns.handler.ts +++ b/cloudflare-gastown/src/handlers/org-towns.handler.ts @@ -100,7 +100,6 @@ export async function handleCreateOrgRig(c: Context, params: { orgId // If this fails, roll back the rig creation to avoid an orphaned record. try { const townDOStub = getTownDOStub(c.env, parsed.data.town_id); - await townDOStub.setTownId(parsed.data.town_id); await townDOStub.configureRig({ rigId: rig.id, townId: parsed.data.town_id, diff --git a/cloudflare-gastown/src/trpc/router.ts b/cloudflare-gastown/src/trpc/router.ts index 45ee0260d0..420b52f69f 100644 --- a/cloudflare-gastown/src/trpc/router.ts +++ b/cloudflare-gastown/src/trpc/router.ts @@ -341,7 +341,6 @@ export const gastownRouter = router({ const ownerStub = ownership.stub; const townStub = getTownDOStub(ctx.env, input.townId); - await townStub.setTownId(input.townId); // For org towns, use the town owner's identity for credentials; // for personal towns the caller is always the owner. 
@@ -583,7 +582,6 @@ export const gastownRouter = router({ } const townStub = getTownDOStub(ctx.env, rig.town_id); - await townStub.setTownId(rig.town_id); return townStub.slingBead({ rigId: rig.id, title: input.title, @@ -609,7 +607,6 @@ export const gastownRouter = router({ await verifyTownOwnership(ctx.env, ctx.userId, input.townId, ctx.orgMemberships); const townStub = getTownDOStub(ctx.env, input.townId); - await townStub.setTownId(input.townId); return townStub.sendMayorMessage(input.message, input.model, input.uiContext); }), @@ -619,7 +616,6 @@ export const gastownRouter = router({ .query(async ({ ctx, input }) => { await verifyTownOwnership(ctx.env, ctx.userId, input.townId, ctx.orgMemberships); const townStub = getTownDOStub(ctx.env, input.townId); - await townStub.setTownId(input.townId); return townStub.getMayorStatus(); }), @@ -629,7 +625,6 @@ export const gastownRouter = router({ .query(async ({ ctx, input }) => { await verifyTownOwnership(ctx.env, ctx.userId, input.townId, ctx.orgMemberships); const townStub = getTownDOStub(ctx.env, input.townId); - await townStub.setTownId(input.townId); return townStub.getAlarmStatus(); }), @@ -666,7 +661,6 @@ export const gastownRouter = router({ } const townStub = getTownDOStub(ctx.env, input.townId); - await townStub.setTownId(input.townId); return townStub.ensureMayor(); }), @@ -861,7 +855,6 @@ export const gastownRouter = router({ .mutation(async ({ ctx, input }) => { await verifyTownOwnership(ctx.env, ctx.userId, input.townId, ctx.orgMemberships); const townStub = getTownDOStub(ctx.env, input.townId); - await townStub.setTownId(input.townId); await townStub.forceRefreshContainerToken(); }), @@ -1068,8 +1061,6 @@ export const gastownRouter = router({ if (!town) throw new TRPCError({ code: 'NOT_FOUND', message: 'Town not found' }); const townStub = getTownDOStub(ctx.env, input.townId); - await townStub.setTownId(input.townId); - // Use the town owner's identity for credentials. 
Only re-mint the // kilocode token if the caller is the owner (they have their pepper // in ctx). For non-owner members, keep the existing town token. @@ -1201,7 +1192,6 @@ export const gastownRouter = router({ .output(RpcAlarmStatusOutput) .query(async ({ ctx, input }) => { const townStub = getTownDOStub(ctx.env, input.townId); - await townStub.setTownId(input.townId); return townStub.getAlarmStatus(); }), From 88431a693c4f58a1e755e8d48cd864137a2084ee Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Wed, 18 Mar 2026 21:11:06 -0500 Subject: [PATCH 05/47] refactor(gastown): extract scheduling module, parallelize alarm loop, fix refinery recovery Extract dispatchAgent, dispatchUnblockedBeads, schedulePendingWork, and hasActiveWork into town/scheduling.ts (~360 lines out of Town.do.ts). Improve refinery recovery after container restart: - Reorder alarm: patrols run BEFORE scheduling so zombie agents detected by witnessPatrol are dispatched in the same tick instead of waiting for the next one. - Exclude refineries from schedulePendingWork so they always go through processReviewQueue with the full system prompt (branch, strategy, gates). recoverStuckReviews resets the MR bead to open after the timeout, and processReviewQueue re-pops it correctly. - Reduce REVIEW_RUNNING_TIMEOUT_MS from 5 min to 2.5 min so the review queue recovers faster after a container restart. Parallelize the alarm loop for lower latency: - Phase 1: witnessPatrol + deaconPatrol run in parallel (patrols) - Phase 2: review pipeline (sequential) runs in parallel with schedulePendingWork (disjoint agent types) - Phase 3: mail delivery, nudge expiry, re-escalation, and triage dispatch all run in parallel (fully independent) - Zombie detection container status checks run in parallel via Promise.allSettled instead of sequential for-loop. 
--- cloudflare-gastown/src/dos/Town.do.ts | 453 ++++-------------- .../src/dos/town/review-queue.ts | 7 +- cloudflare-gastown/src/dos/town/scheduling.ts | 364 ++++++++++++++ 3 files changed, 467 insertions(+), 357 deletions(-) create mode 100644 cloudflare-gastown/src/dos/town/scheduling.ts diff --git a/cloudflare-gastown/src/dos/Town.do.ts b/cloudflare-gastown/src/dos/Town.do.ts index 06d5b8851f..176d094091 100644 --- a/cloudflare-gastown/src/dos/Town.do.ts +++ b/cloudflare-gastown/src/dos/Town.do.ts @@ -26,13 +26,13 @@ import * as config from './town/config'; import * as rigs from './town/rigs'; import * as dispatch from './town/container-dispatch'; import * as patrol from './town/patrol'; +import * as scheduling from './town/scheduling'; import { GitHubPRStatusSchema, GitLabMRStatusSchema } from '../util/platform-pr.util'; // Table imports for beads-centric operations import { beads, BeadRecord, - AgentBeadRecord, EscalationBeadRecord, ConvoyBeadRecord, } from '../db/tables/beads.table'; @@ -115,8 +115,6 @@ function formatEventMessage(row: Record): string { // Alarm intervals const ACTIVE_ALARM_INTERVAL_MS = 5_000; // 5s when agents are active const IDLE_ALARM_INTERVAL_MS = 1 * 60_000; // 1m when idle -const DISPATCH_COOLDOWN_MS = 2 * 60_000; // 2 min — skip agents with recent dispatch activity -const MAX_DISPATCH_ATTEMPTS = 5; // Escalation constants const STALE_ESCALATION_THRESHOLD_MS = 4 * 60 * 60 * 1000; @@ -240,6 +238,20 @@ export class TownDO extends DurableObject { writeEvent(this.env, { ...data, delivery: 'internal', userId: this._ownerUserId }); } + /** Build the context object used by the scheduling sub-module. 
*/ + private get schedulingCtx(): Parameters[0] { + return { + sql: this.sql, + env: this.env, + storage: this.ctx.storage, + townId: this.townId, + getTownConfig: () => this.getTownConfig(), + getRigConfig: (rigId: string) => this.getRigConfig(rigId), + resolveKilocodeToken: () => this.resolveKilocodeToken(), + emitEvent: data => this.emitEvent(data), + }; + } + // ── WebSocket: status broadcast ────────────────────────────────────── /** @@ -2735,56 +2747,68 @@ export class TownDO extends DurableObject { } } - try { - await this.processReviewQueue(); - } catch (err) { - console.error(`${TOWN_LOG} alarm: processReviewQueue failed`, err); - Sentry.captureException(err); - } - try { - await this.processConvoyLandings(); - } catch (err) { - console.error(`${TOWN_LOG} alarm: processConvoyLandings failed`, err); - Sentry.captureException(err); - } - try { - await this.schedulePendingWork(); - } catch (err) { - console.error(`${TOWN_LOG} alarm: schedulePendingWork failed`, err); - Sentry.captureException(err); - } - try { - await this.witnessPatrol(); - } catch (err) { - console.error(`${TOWN_LOG} alarm: witnessPatrol failed`, err); - Sentry.captureException(err); - } - try { - this.deaconPatrol(); - } catch (err) { - console.error(`${TOWN_LOG} alarm: deaconPatrol failed`, err); - Sentry.captureException(err); - } - try { - await this.deliverPendingMail(); - } catch (err) { - console.warn(`${TOWN_LOG} alarm: deliverPendingMail failed`, err); - } - try { - await this.expireStaleNudges(); - } catch (err) { - console.warn(`${TOWN_LOG} alarm: expireStaleNudges failed`, err); - } - try { - await this.reEscalateStaleEscalations(); - } catch (err) { - console.warn(`${TOWN_LOG} alarm: reEscalation failed`, err); - } - try { - await this.maybeDispatchTriageAgent(); - } catch (err) { - console.warn(`${TOWN_LOG} alarm: maybeDispatchTriageAgent failed`, err); - } + // ── Phase 1: Patrols (detect dead agents, recover stale state) ─── + // Patrols run first so that zombie agents are 
reset to idle and + // stale hooks are cleared before the scheduler tries to dispatch. + // This lets recovered agents dispatch in the same alarm tick + // instead of waiting for the next one. + await Promise.allSettled([ + this.witnessPatrol().catch(err => { + console.error(`${TOWN_LOG} alarm: witnessPatrol failed`, err); + Sentry.captureException(err); + }), + // deaconPatrol is sync — wrap in a resolved promise for allSettled + Promise.resolve().then(() => { + try { + this.deaconPatrol(); + } catch (err) { + console.error(`${TOWN_LOG} alarm: deaconPatrol failed`, err); + Sentry.captureException(err); + } + }), + ]); + + // ── Phase 2: Review pipeline + scheduling (dispatches agents) ── + // processReviewQueue and processConvoyLandings share the review + // queue, so they run sequentially. schedulePendingWork runs in + // parallel — it only handles non-refinery agents and reads + // disjoint state. + await Promise.allSettled([ + (async () => { + try { + await this.processReviewQueue(); + } catch (err) { + console.error(`${TOWN_LOG} alarm: processReviewQueue failed`, err); + Sentry.captureException(err); + } + try { + await this.processConvoyLandings(); + } catch (err) { + console.error(`${TOWN_LOG} alarm: processConvoyLandings failed`, err); + Sentry.captureException(err); + } + })(), + scheduling.schedulePendingWork(this.schedulingCtx).catch(err => { + console.error(`${TOWN_LOG} alarm: schedulePendingWork failed`, err); + Sentry.captureException(err); + }), + ]); + + // ── Phase 3: Housekeeping (independent, all parallelizable) ──── + await Promise.allSettled([ + this.deliverPendingMail().catch(err => + console.warn(`${TOWN_LOG} alarm: deliverPendingMail failed`, err) + ), + this.expireStaleNudges().catch(err => + console.warn(`${TOWN_LOG} alarm: expireStaleNudges failed`, err) + ), + this.reEscalateStaleEscalations().catch(err => + console.warn(`${TOWN_LOG} alarm: reEscalation failed`, err) + ), + this.maybeDispatchTriageAgent().catch(err => + 
console.warn(`${TOWN_LOG} alarm: maybeDispatchTriageAgent failed`, err) + ), + ]); // Re-arm: fast when active, slow when idle const active = this.hasActiveWork(); const interval = active ? ACTIVE_ALARM_INTERVAL_MS : IDLE_ALARM_INTERVAL_MS; @@ -2822,309 +2846,21 @@ export class TownDO extends DurableObject { } private hasActiveWork(): boolean { - const activeAgentRows = [ - ...query( - this.sql, - /* sql */ `SELECT COUNT(*) as cnt FROM ${agent_metadata} WHERE ${agent_metadata.status} IN ('working', 'stalled')`, - [] - ), - ]; - const pendingBeadRows = [ - ...query( - this.sql, - /* sql */ `SELECT COUNT(*) as cnt FROM ${agent_metadata} WHERE ${agent_metadata.status} = 'idle' AND ${agent_metadata.current_hook_bead_id} IS NOT NULL`, - [] - ), - ]; - const pendingReviewRows = [ - ...query( - this.sql, - /* sql */ `SELECT COUNT(*) as cnt FROM ${beads} WHERE ${beads.type} = 'merge_request' AND ${beads.status} IN ('open', 'in_progress')`, - [] - ), - ]; - const pendingTriageRows = [ - ...query( - this.sql, - /* sql */ `SELECT COUNT(*) as cnt FROM ${beads} WHERE ${beads.type} = 'issue' AND ${beads.labels} LIKE ? AND ${beads.status} = 'open'`, - [patrol.TRIAGE_LABEL_LIKE] - ), - ]; - return ( - Number(activeAgentRows[0]?.cnt ?? 0) > 0 || - Number(pendingBeadRows[0]?.cnt ?? 0) > 0 || - Number(pendingReviewRows[0]?.cnt ?? 0) > 0 || - Number(pendingTriageRows[0]?.cnt ?? 0) > 0 - ); + return scheduling.hasActiveWork(this.sql); } - /** - * Dispatch a single agent to the container. Used for eager dispatch from - * slingBead (so agents start immediately) and from schedulePendingWork - * (periodic recovery). Returns true if the agent was started. - */ - private async dispatchAgent(agent: Agent, bead: Bead): Promise { - try { - const rigId = agent.rig_id ?? rigs.listRigs(this.sql)[0]?.id ?? ''; - const rigConfig = rigId ? 
await this.getRigConfig(rigId) : null; - if (!rigConfig) { - console.warn(`${TOWN_LOG} dispatchAgent: no rig config for agent=${agent.id} rig=${rigId}`); - return false; - } - - const townConfig = await this.getTownConfig(); - const kilocodeToken = await this.resolveKilocodeToken(); - - // Check if this bead belongs to a convoy and resolve its feature branch. - // Convoy beads branch from the feature branch, not from defaultBranch. - const convoyId = beadOps.getConvoyForBead(this.sql, bead.bead_id); - const convoyFeatureBranch = convoyId - ? beadOps.getConvoyFeatureBranch(this.sql, convoyId) - : null; - - // Transition the bead to in_progress BEFORE starting the container. - // This must happen synchronously within the DO's I/O gate — the - // fire-and-forget pattern used by slingBead/slingConvoy means the - // calling RPC may return before startAgentInContainer completes, - // closing the I/O gate and preventing further SQL writes. - const currentBead = beadOps.getBead(this.sql, bead.bead_id); - if ( - currentBead && - currentBead.status !== 'in_progress' && - currentBead.status !== 'closed' && - currentBead.status !== 'failed' - ) { - beadOps.updateBeadStatus(this.sql, bead.bead_id, 'in_progress', agent.id); - } - - // Set status to 'working' BEFORE the async container start. This - // must happen synchronously so the SQL write executes while the I/O - // gate is still open. When dispatchAgent is called fire-and-forget - // (from slingBead, slingConvoy, dispatchUnblockedBeads), any SQL - // writes after the first `await` may be silently dropped because - // the DO's RPC response closes the I/O gate. If the container fails - // to start, we roll back to 'idle'. - const timestamp = now(); - query( - this.sql, - /* sql */ ` - UPDATE ${agent_metadata} - SET ${agent_metadata.columns.status} = 'working', - ${agent_metadata.columns.dispatch_attempts} = ${agent_metadata.columns.dispatch_attempts} + 1, - ${agent_metadata.columns.last_activity_at} = ? 
- WHERE ${agent_metadata.bead_id} = ? - `, - [timestamp, agent.id] - ); - - const started = await dispatch.startAgentInContainer(this.env, this.ctx.storage, { - townId: this.townId, - rigId, - userId: rigConfig.userId, - agentId: agent.id, - agentName: agent.name, - role: agent.role, - identity: agent.identity, - beadId: bead.bead_id, - beadTitle: bead.title, - beadBody: bead.body ?? '', - checkpoint: agent.checkpoint, - gitUrl: rigConfig.gitUrl, - defaultBranch: rigConfig.defaultBranch, - kilocodeToken, - townConfig, - platformIntegrationId: rigConfig.platformIntegrationId, - convoyFeatureBranch: convoyFeatureBranch ?? undefined, - }); - - if (started) { - // Reset dispatch_attempts on success (best-effort — may be - // dropped if the I/O gate is already closed, but that's fine - // because the agent is already 'working'). - query( - this.sql, - /* sql */ ` - UPDATE ${agent_metadata} - SET ${agent_metadata.columns.dispatch_attempts} = 0 - WHERE ${agent_metadata.bead_id} = ? - `, - [agent.id] - ); - console.log(`${TOWN_LOG} dispatchAgent: started agent=${agent.name}(${agent.id})`); - this.emitEvent({ - event: 'agent.spawned', - townId: this.townId, - rigId, - agentId: agent.id, - beadId: bead.bead_id, - role: agent.role, - }); - } else { - // Container failed to start — roll back agent to idle and bead - // to open so schedulePendingWork can retry on the next alarm. - query( - this.sql, - /* sql */ ` - UPDATE ${agent_metadata} - SET ${agent_metadata.columns.status} = 'idle' - WHERE ${agent_metadata.bead_id} = ? 
- `, - [agent.id] - ); - if (agent.current_hook_bead_id) { - beadOps.updateBeadStatus(this.sql, agent.current_hook_bead_id, 'open', agent.id); - } - this.emitEvent({ - event: 'agent.dispatch_failed', - townId: this.townId, - rigId, - agentId: agent.id, - beadId: bead.bead_id, - role: agent.role, - }); - } - return started; - } catch (err) { - console.error(`${TOWN_LOG} dispatchAgent: failed for agent=${agent.id}:`, err); - Sentry.captureException(err, { extra: { agentId: agent.id, beadId: bead.bead_id } }); - // Roll back agent and bead to prevent them from being stuck in - // working/in_progress state when the container call throws. - try { - query( - this.sql, - /* sql */ ` - UPDATE ${agent_metadata} - SET ${agent_metadata.columns.status} = 'idle' - WHERE ${agent_metadata.bead_id} = ? - `, - [agent.id] - ); - if (agent.current_hook_bead_id) { - beadOps.updateBeadStatus(this.sql, agent.current_hook_bead_id, 'open', agent.id); - } - } catch (rollbackErr) { - console.error(`${TOWN_LOG} dispatchAgent: rollback also failed:`, rollbackErr); - } - this.emitEvent({ - event: 'agent.dispatch_failed', - townId: this.townId, - agentId: agent.id, - beadId: bead.bead_id, - role: agent.role, - }); - return false; - } + /** Dispatch a single agent to the container. Delegates to scheduling module. */ + private dispatchAgent( + agent: Agent, + bead: Bead, + options?: { systemPromptOverride?: string } + ): Promise { + return scheduling.dispatchAgent(this.schedulingCtx, agent, bead, options); } - /** - * When a bead closes, find beads that were blocked by it and are now - * fully unblocked (all 'blocks' dependencies resolved). Dispatch their - * assigned agents. - */ + /** When a bead closes, dispatch any beads it was blocking. 
*/ private dispatchUnblockedBeads(closedBeadId: string): void { - const unblockedIds = beadOps.getNewlyUnblockedBeads(this.sql, closedBeadId); - if (unblockedIds.length === 0) return; - - console.log( - `${TOWN_LOG} dispatchUnblockedBeads: ${unblockedIds.length} beads unblocked by ${closedBeadId}` - ); - - for (const beadId of unblockedIds) { - const bead = beadOps.getBead(this.sql, beadId); - if (!bead || bead.status === 'closed' || bead.status === 'failed') continue; - - // Find the agent hooked to this bead - if (!bead.assignee_agent_bead_id) continue; - const agent = agents.getAgent(this.sql, bead.assignee_agent_bead_id); - if (!agent || agent.status !== 'idle') continue; - - this.dispatchAgent(agent, bead).catch(err => - console.error( - `${TOWN_LOG} dispatchUnblockedBeads: fire-and-forget dispatch failed for bead=${beadId}`, - err - ) - ); - } - } - - /** - * Find idle agents with hooked beads and dispatch them to the container. - * Agents whose last_activity_at is within the dispatch cooldown are - * skipped — they have a fire-and-forget dispatch already in flight. - */ - private async schedulePendingWork(): Promise { - const cooldownCutoff = new Date(Date.now() - DISPATCH_COOLDOWN_MS).toISOString(); - const rows = [ - ...query( - this.sql, - /* sql */ ` - SELECT ${beads}.*, - ${agent_metadata.role}, ${agent_metadata.identity}, - ${agent_metadata.container_process_id}, - ${agent_metadata.status} AS status, - ${agent_metadata.current_hook_bead_id}, - ${agent_metadata.dispatch_attempts}, ${agent_metadata.last_activity_at}, - ${agent_metadata.checkpoint}, - ${agent_metadata.agent_status_message}, ${agent_metadata.agent_status_updated_at} - FROM ${beads} - INNER JOIN ${agent_metadata} ON ${beads.bead_id} = ${agent_metadata.bead_id} - WHERE ${agent_metadata.status} = 'idle' - AND ${agent_metadata.current_hook_bead_id} IS NOT NULL - AND (${agent_metadata.last_activity_at} IS NULL OR ${agent_metadata.last_activity_at} < ?) 
- `, - [cooldownCutoff] - ), - ]; - const pendingAgents: Agent[] = AgentBeadRecord.array() - .parse(rows) - .map(row => ({ - id: row.bead_id, - rig_id: row.rig_id, - role: row.role, - name: row.title, - identity: row.identity, - status: row.status, - current_hook_bead_id: row.current_hook_bead_id, - dispatch_attempts: row.dispatch_attempts, - last_activity_at: row.last_activity_at, - checkpoint: row.checkpoint, - created_at: row.created_at, - agent_status_message: row.agent_status_message, - agent_status_updated_at: row.agent_status_updated_at, - })); - - console.log(`${TOWN_LOG} schedulePendingWork: found ${pendingAgents.length} pending agents`); - if (pendingAgents.length === 0) return; - - const dispatchTasks: Array<() => Promise> = []; - - for (const agent of pendingAgents) { - const beadId = agent.current_hook_bead_id; - if (!beadId) continue; - const bead = beadOps.getBead(this.sql, beadId); - if (!bead) continue; - - if (agent.dispatch_attempts >= MAX_DISPATCH_ATTEMPTS) { - beadOps.updateBeadStatus(this.sql, beadId, 'failed', agent.id); - agents.unhookBead(this.sql, agent.id); - continue; - } - - // Skip beads that still have unresolved 'blocks' dependencies — - // they'll be dispatched by dispatchUnblockedBeads when their - // blockers close. - if (beadOps.hasUnresolvedBlockers(this.sql, beadId)) { - continue; - } - - dispatchTasks.push(async () => { - await this.dispatchAgent(agent, bead); - }); - } - - if (dispatchTasks.length > 0) { - await Promise.allSettled(dispatchTasks.map(fn => fn())); - } + scheduling.dispatchUnblockedBeads(this.schedulingCtx, closedBeadId); } /** @@ -3156,10 +2892,18 @@ export class TownDO extends DurableObject { ), ]); - for (const working of workingAgents) { + // Check container status for all working agents in parallel to + // avoid serial network round-trips (one per agent). 
+ const statusChecks = workingAgents.map(async working => { const agentId = working.bead_id; - const containerInfo = await dispatch.checkAgentContainerStatus(this.env, townId, agentId); + return { agentId, containerInfo }; + }); + const statusResults = await Promise.allSettled(statusChecks); + + for (const result of statusResults) { + if (result.status !== 'fulfilled') continue; + const { agentId, containerInfo } = result.value; if (containerInfo.status === 'not_found' || containerInfo.status === 'exited') { if (containerInfo.exitReason === 'completed') { @@ -3176,7 +2920,6 @@ export class TownDO extends DurableObject { `, [now(), agentId] ); - continue; } } @@ -3271,7 +3014,7 @@ export class TownDO extends DurableObject { patrol.TRIAGE_REQUEST_LABEL, patrol.TRIAGE_BATCH_LABEL ); - const cooldownCutoff = new Date(Date.now() - DISPATCH_COOLDOWN_MS).toISOString(); + const cooldownCutoff = new Date(Date.now() - scheduling.DISPATCH_COOLDOWN_MS).toISOString(); const existingBatch = [ ...query( this.sql, diff --git a/cloudflare-gastown/src/dos/town/review-queue.ts b/cloudflare-gastown/src/dos/town/review-queue.ts index 8150a21185..26c02d0c69 100644 --- a/cloudflare-gastown/src/dos/town/review-queue.ts +++ b/cloudflare-gastown/src/dos/town/review-queue.ts @@ -28,8 +28,11 @@ import { getAgent, unhookBead } from './agents'; import { getRig } from './rigs'; import type { ReviewQueueInput, ReviewQueueEntry, AgentDoneInput, Molecule } from '../../types'; -// Review entries stuck in 'running' past this timeout are reset to 'pending' -const REVIEW_RUNNING_TIMEOUT_MS = 5 * 60 * 1000; +// Review entries stuck in 'running' past this timeout are reset to 'pending'. +// Set slightly above the dispatch cooldown (2 min) so that zombie +// detection + cooldown expiry have a chance to recover the agent +// before we force-reset the MR bead. 
+const REVIEW_RUNNING_TIMEOUT_MS = 2.5 * 60 * 1000; function generateId(): string { return crypto.randomUUID(); diff --git a/cloudflare-gastown/src/dos/town/scheduling.ts b/cloudflare-gastown/src/dos/town/scheduling.ts new file mode 100644 index 0000000000..a250adcb41 --- /dev/null +++ b/cloudflare-gastown/src/dos/town/scheduling.ts @@ -0,0 +1,364 @@ +/** + * Agent scheduling and dispatch for the Town DO alarm loop. + * + * Owns the core dispatch/retry logic that was previously inline in + * Town.do.ts. The Town DO delegates to these pure(ish) functions, + * passing its SQL handle and env bindings. + */ + +import * as Sentry from '@sentry/cloudflare'; +import { beads, AgentBeadRecord } from '../../db/tables/beads.table'; +import { agent_metadata } from '../../db/tables/agent-metadata.table'; +import { query } from '../../util/query.util'; +import * as beadOps from './beads'; +import * as agents from './agents'; +import * as rigs from './rigs'; +import * as dispatch from './container-dispatch'; +import * as patrol from './patrol'; +import type { Agent, Bead, TownConfig } from '../../types'; +import type { GastownEventData } from '../../util/analytics.util'; + +const LOG = '[scheduling]'; + +// ── Constants ────────────────────────────────────────────────────────── + +export const DISPATCH_COOLDOWN_MS = 2 * 60_000; // 2 min +export const MAX_DISPATCH_ATTEMPTS = 5; + +// ── Context passed by the Town DO ────────────────────────────────────── + +type SchedulingContext = { + sql: SqlStorage; + env: Env; + storage: DurableObjectStorage; + townId: string; + getTownConfig: () => Promise; + getRigConfig: (rigId: string) => Promise; + resolveKilocodeToken: () => Promise; + emitEvent: (data: Omit) => void; +}; + +type RigConfig = { + townId: string; + rigId: string; + gitUrl: string; + defaultBranch: string; + userId: string; + kilocodeToken?: string; + platformIntegrationId?: string; + merge_strategy?: string; +}; + +function now(): string { + return new 
Date().toISOString(); +} + +// ── dispatchAgent ────────────────────────────────────────────────────── + +/** + * Dispatch a single agent to the container. Transitions the bead to + * in_progress and the agent to working BEFORE the async network call + * (I/O gate safety for fire-and-forget callers). Returns true if the + * container accepted the agent. + */ +export async function dispatchAgent( + ctx: SchedulingContext, + agent: Agent, + bead: Bead, + options?: { systemPromptOverride?: string } +): Promise { + try { + const rigId = agent.rig_id ?? rigs.listRigs(ctx.sql)[0]?.id ?? ''; + const rigConfig = rigId ? await ctx.getRigConfig(rigId) : null; + if (!rigConfig) { + console.warn(`${LOG} dispatchAgent: no rig config for agent=${agent.id} rig=${rigId}`); + return false; + } + + const townConfig = await ctx.getTownConfig(); + const kilocodeToken = await ctx.resolveKilocodeToken(); + + const convoyId = beadOps.getConvoyForBead(ctx.sql, bead.bead_id); + const convoyFeatureBranch = convoyId ? beadOps.getConvoyFeatureBranch(ctx.sql, convoyId) : null; + + // Transition bead to in_progress BEFORE the async container start. + // Must happen synchronously within the I/O gate — fire-and-forget + // callers (slingBead, slingConvoy) close the gate before the + // network call completes. + const currentBead = beadOps.getBead(ctx.sql, bead.bead_id); + if ( + currentBead && + currentBead.status !== 'in_progress' && + currentBead.status !== 'closed' && + currentBead.status !== 'failed' + ) { + beadOps.updateBeadStatus(ctx.sql, bead.bead_id, 'in_progress', agent.id); + } + + // Set agent to 'working' BEFORE the async container start (same + // I/O gate rationale). + const timestamp = now(); + query( + ctx.sql, + /* sql */ ` + UPDATE ${agent_metadata} + SET ${agent_metadata.columns.status} = 'working', + ${agent_metadata.columns.dispatch_attempts} = ${agent_metadata.columns.dispatch_attempts} + 1, + ${agent_metadata.columns.last_activity_at} = ? 
+ WHERE ${agent_metadata.bead_id} = ? + `, + [timestamp, agent.id] + ); + + const started = await dispatch.startAgentInContainer(ctx.env, ctx.storage, { + townId: ctx.townId, + rigId, + userId: rigConfig.userId, + agentId: agent.id, + agentName: agent.name, + role: agent.role, + identity: agent.identity, + beadId: bead.bead_id, + beadTitle: bead.title, + beadBody: bead.body ?? '', + checkpoint: agent.checkpoint, + gitUrl: rigConfig.gitUrl, + defaultBranch: rigConfig.defaultBranch, + kilocodeToken, + townConfig, + platformIntegrationId: rigConfig.platformIntegrationId, + convoyFeatureBranch: convoyFeatureBranch ?? undefined, + systemPromptOverride: options?.systemPromptOverride, + }); + + if (started) { + // Best-effort: may be dropped if I/O gate is closed + query( + ctx.sql, + /* sql */ ` + UPDATE ${agent_metadata} + SET ${agent_metadata.columns.dispatch_attempts} = 0 + WHERE ${agent_metadata.bead_id} = ? + `, + [agent.id] + ); + console.log(`${LOG} dispatchAgent: started agent=${agent.name}(${agent.id})`); + ctx.emitEvent({ + event: 'agent.spawned', + townId: ctx.townId, + rigId, + agentId: agent.id, + beadId: bead.bead_id, + role: agent.role, + }); + } else { + // Container failed — roll back agent to idle, bead to open + query( + ctx.sql, + /* sql */ ` + UPDATE ${agent_metadata} + SET ${agent_metadata.columns.status} = 'idle' + WHERE ${agent_metadata.bead_id} = ? 
+ `, + [agent.id] + ); + if (agent.current_hook_bead_id) { + beadOps.updateBeadStatus(ctx.sql, agent.current_hook_bead_id, 'open', agent.id); + } + ctx.emitEvent({ + event: 'agent.dispatch_failed', + townId: ctx.townId, + rigId, + agentId: agent.id, + beadId: bead.bead_id, + role: agent.role, + }); + } + return started; + } catch (err) { + console.error(`${LOG} dispatchAgent: failed for agent=${agent.id}:`, err); + Sentry.captureException(err, { extra: { agentId: agent.id, beadId: bead.bead_id } }); + try { + query( + ctx.sql, + /* sql */ ` + UPDATE ${agent_metadata} + SET ${agent_metadata.columns.status} = 'idle' + WHERE ${agent_metadata.bead_id} = ? + `, + [agent.id] + ); + if (agent.current_hook_bead_id) { + beadOps.updateBeadStatus(ctx.sql, agent.current_hook_bead_id, 'open', agent.id); + } + } catch (rollbackErr) { + console.error(`${LOG} dispatchAgent: rollback also failed:`, rollbackErr); + } + ctx.emitEvent({ + event: 'agent.dispatch_failed', + townId: ctx.townId, + agentId: agent.id, + beadId: bead.bead_id, + role: agent.role, + }); + return false; + } +} + +// ── dispatchUnblockedBeads ───────────────────────────────────────────── + +/** + * When a bead closes, find beads that were blocked by it and are now + * fully unblocked. Dispatch their assigned agents (fire-and-forget). 
+ */ +export function dispatchUnblockedBeads(ctx: SchedulingContext, closedBeadId: string): void { + const unblockedIds = beadOps.getNewlyUnblockedBeads(ctx.sql, closedBeadId); + if (unblockedIds.length === 0) return; + + console.log( + `${LOG} dispatchUnblockedBeads: ${unblockedIds.length} beads unblocked by ${closedBeadId}` + ); + + for (const beadId of unblockedIds) { + const bead = beadOps.getBead(ctx.sql, beadId); + if (!bead || bead.status === 'closed' || bead.status === 'failed') continue; + + if (!bead.assignee_agent_bead_id) continue; + const agent = agents.getAgent(ctx.sql, bead.assignee_agent_bead_id); + if (!agent || agent.status !== 'idle') continue; + + dispatchAgent(ctx, agent, bead).catch(err => + console.error( + `${LOG} dispatchUnblockedBeads: fire-and-forget dispatch failed for bead=${beadId}`, + err + ) + ); + } +} + +// ── schedulePendingWork ──────────────────────────────────────────────── + +/** + * Find idle agents with hooked beads and dispatch them. Agents within + * the dispatch cooldown are skipped (fire-and-forget dispatch in flight). + * + * Refineries are excluded — they must go through processReviewQueue so + * they receive the full system prompt with branch, strategy, and gate + * context. recoverStuckReviews resets their MR bead to 'open' after the + * timeout, and processReviewQueue re-pops it with the correct prompt. 
+ */ +export async function schedulePendingWork(ctx: SchedulingContext): Promise { + const cooldownCutoff = new Date(Date.now() - DISPATCH_COOLDOWN_MS).toISOString(); + const rows = [ + ...query( + ctx.sql, + /* sql */ ` + SELECT ${beads}.*, + ${agent_metadata.role}, ${agent_metadata.identity}, + ${agent_metadata.container_process_id}, + ${agent_metadata.status} AS status, + ${agent_metadata.current_hook_bead_id}, + ${agent_metadata.dispatch_attempts}, ${agent_metadata.last_activity_at}, + ${agent_metadata.checkpoint}, + ${agent_metadata.agent_status_message}, ${agent_metadata.agent_status_updated_at} + FROM ${beads} + INNER JOIN ${agent_metadata} ON ${beads.bead_id} = ${agent_metadata.bead_id} + WHERE ${agent_metadata.status} = 'idle' + AND ${agent_metadata.current_hook_bead_id} IS NOT NULL + AND ${agent_metadata.role} != 'refinery' + AND (${agent_metadata.last_activity_at} IS NULL OR ${agent_metadata.last_activity_at} < ?) + `, + [cooldownCutoff] + ), + ]; + const pendingAgents: Agent[] = AgentBeadRecord.array() + .parse(rows) + .map(row => ({ + id: row.bead_id, + rig_id: row.rig_id, + role: row.role, + name: row.title, + identity: row.identity, + status: row.status, + current_hook_bead_id: row.current_hook_bead_id, + dispatch_attempts: row.dispatch_attempts, + last_activity_at: row.last_activity_at, + checkpoint: row.checkpoint, + created_at: row.created_at, + agent_status_message: row.agent_status_message, + agent_status_updated_at: row.agent_status_updated_at, + })); + + console.log(`${LOG} schedulePendingWork: found ${pendingAgents.length} pending agents`); + if (pendingAgents.length === 0) return; + + const dispatchTasks: Array<() => Promise> = []; + + for (const agent of pendingAgents) { + const beadId = agent.current_hook_bead_id; + if (!beadId) continue; + const bead = beadOps.getBead(ctx.sql, beadId); + if (!bead) continue; + + if (agent.dispatch_attempts >= MAX_DISPATCH_ATTEMPTS) { + beadOps.updateBeadStatus(ctx.sql, beadId, 'failed', agent.id); + 
agents.unhookBead(ctx.sql, agent.id); + continue; + } + + if (beadOps.hasUnresolvedBlockers(ctx.sql, beadId)) { + continue; + } + + dispatchTasks.push(async () => { + await dispatchAgent(ctx, agent, bead); + }); + } + + if (dispatchTasks.length > 0) { + await Promise.allSettled(dispatchTasks.map(fn => fn())); + } +} + +// ── hasActiveWork ────────────────────────────────────────────────────── + +/** + * Returns true if the town has work that requires the fast (5s) alarm + * interval. Used to decide between active and idle alarm cadence. + */ +export function hasActiveWork(sql: SqlStorage): boolean { + const activeAgentRows = [ + ...query( + sql, + /* sql */ `SELECT COUNT(*) as cnt FROM ${agent_metadata} WHERE ${agent_metadata.status} IN ('working', 'stalled')`, + [] + ), + ]; + const pendingBeadRows = [ + ...query( + sql, + /* sql */ `SELECT COUNT(*) as cnt FROM ${agent_metadata} WHERE ${agent_metadata.status} = 'idle' AND ${agent_metadata.current_hook_bead_id} IS NOT NULL`, + [] + ), + ]; + const pendingReviewRows = [ + ...query( + sql, + /* sql */ `SELECT COUNT(*) as cnt FROM ${beads} WHERE ${beads.type} = 'merge_request' AND ${beads.status} IN ('open', 'in_progress')`, + [] + ), + ]; + const pendingTriageRows = [ + ...query( + sql, + /* sql */ `SELECT COUNT(*) as cnt FROM ${beads} WHERE ${beads.type} = 'issue' AND ${beads.labels} LIKE ? AND ${beads.status} = 'open'`, + [patrol.TRIAGE_LABEL_LIKE] + ), + ]; + return ( + Number(activeAgentRows[0]?.cnt ?? 0) > 0 || + Number(pendingBeadRows[0]?.cnt ?? 0) > 0 || + Number(pendingReviewRows[0]?.cnt ?? 0) > 0 || + Number(pendingTriageRows[0]?.cnt ?? 
0) > 0 + ); +} From 489823e6b4e6a785ef70b83b3a3110b5c5f84175 Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Wed, 18 Mar 2026 21:38:50 -0500 Subject: [PATCH 06/47] fix(container): configure credential helper on bare repo for git-lfs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit git-lfs smudge filters triggered by worktree creation (git worktree add, git reset --hard) need credentials to download LFS objects from the batch API. The token is embedded in the remote URL, but some git-lfs versions resolve credentials through the credential helper chain for the LFS batch endpoint (which uses a different URL path). Configure a credential-store helper on the bare repo right after clone and on every fetch (in case the token rotated), so all worktrees — including the browse worktree created by /repos/setup — inherit working credentials for LFS operations. --- .../container/src/git-manager.ts | 47 ++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/cloudflare-gastown/container/src/git-manager.ts b/cloudflare-gastown/container/src/git-manager.ts index 24b312d43c..2d154d8769 100644 --- a/cloudflare-gastown/container/src/git-manager.ts +++ b/cloudflare-gastown/container/src/git-manager.ts @@ -1,4 +1,4 @@ -import { mkdir, realpath, rm, stat } from 'node:fs/promises'; +import { mkdir, realpath, rm, stat, writeFile } from 'node:fs/promises'; import { join, resolve } from 'node:path'; import type { CloneOptions, WorktreeOptions } from './types'; @@ -105,6 +105,49 @@ function authenticateGitUrl(gitUrl: string, envVars?: Record): s return gitUrl; } +/** + * Configure a credential-store helper on the bare repo so that worktree + * operations (checkout, reset, lfs smudge) can resolve credentials + * through the standard git credential chain. 
+ * + * Without this, git-lfs smudge filters triggered by `git worktree add` + * or `git reset --hard` fail with "Smudge error" because the LFS batch + * API request has no credentials. The token is embedded in the remote + * URL, but some git-lfs versions require the credential helper for the + * LFS batch endpoint (which uses a different URL path). + */ +async function configureRepoCredentials( + repoDir: string, + gitUrl: string, + envVars?: Record +): Promise { + if (!envVars) return; + + const token = envVars.GIT_TOKEN ?? envVars.GITHUB_TOKEN; + const gitlabToken = envVars.GITLAB_TOKEN; + if (!token && !gitlabToken) return; + + try { + const url = new URL(gitUrl); + const credentialLine = + gitlabToken && (url.hostname.includes('gitlab') || envVars.GITLAB_INSTANCE_URL) + ? `https://oauth2:${gitlabToken}@${url.hostname}` + : token + ? `https://x-access-token:${token}@${url.hostname}` + : null; + + if (!credentialLine) return; + + // Write to a per-repo credential file outside the repo itself + const credFile = `/tmp/.git-credentials-repo-${repoDir.replace(/[^a-zA-Z0-9]/g, '-')}`; + await writeFile(credFile, credentialLine + '\n', { mode: 0o600 }); + + await exec('git', ['config', 'credential.helper', `store --file=${credFile}`], repoDir); + } catch (err) { + console.warn(`Failed to configure repo credentials for ${repoDir}:`, err); + } +} + /** * Validate a branch name — block control characters and shell metacharacters. 
*/ @@ -211,6 +254,7 @@ async function cloneRepoInner( await exec('git', ['remote', 'set-url', 'origin', authUrl], dir).catch(err => { console.warn(`Failed to update remote URL for rig ${options.rigId}:`, err); }); + await configureRepoCredentials(dir, options.gitUrl, options.envVars); await exec('git', ['fetch', '--all', '--prune'], dir); console.log(`Fetched latest for rig ${options.rigId}`); return dir; @@ -228,6 +272,7 @@ async function cloneRepoInner( await mkdir(dir, { recursive: true }); await exec('git', ['clone', '--no-checkout', '--branch', options.defaultBranch, authUrl, dir]); + await configureRepoCredentials(dir, options.gitUrl, options.envVars); console.log(`Cloned repo for rig ${options.rigId}`); return dir; } From 19013be503e2f7f8bfcfb8ad40839b94454f619e Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Wed, 18 Mar 2026 21:47:35 -0500 Subject: [PATCH 07/47] fix(gastown): add rehookOrphanedBeads patrol to recover stuck beads MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After container restarts, review failures, or rework cycles, beads can end up in 'open' with a stale assignee_agent_bead_id — the agent was unhooked but the bead still references it. Neither feedStrandedConvoys (requires assignee IS NULL) nor schedulePendingWork (requires agent hooked) picks these up, leaving them permanently stuck. New rehookOrphanedBeads() patrol function finds open issue beads where the assigned agent's current_hook_bead_id doesn't point back to the bead, and re-hooks a polecat so schedulePendingWork dispatches it on the next tick. 
--- cloudflare-gastown/src/dos/Town.do.ts | 3 + cloudflare-gastown/src/dos/town/patrol.ts | 73 +++++++++++++++++++++++ 2 files changed, 76 insertions(+) diff --git a/cloudflare-gastown/src/dos/Town.do.ts b/cloudflare-gastown/src/dos/Town.do.ts index 176d094091..a2ec376108 100644 --- a/cloudflare-gastown/src/dos/Town.do.ts +++ b/cloudflare-gastown/src/dos/Town.do.ts @@ -2992,6 +2992,9 @@ export class TownDO extends DurableObject { // ── Stranded convoy feeding ──────────────────────────────────── patrol.feedStrandedConvoys(this.sql, this.townId); + // ── Orphaned bead re-hooking ─────────────────────────────────── + patrol.rehookOrphanedBeads(this.sql, this.townId); + // ── Crash loop detection ─────────────────────────────────────── patrol.detectCrashLoops(this.sql); } diff --git a/cloudflare-gastown/src/dos/town/patrol.ts b/cloudflare-gastown/src/dos/town/patrol.ts index 5fb78db442..e3675eb5e1 100644 --- a/cloudflare-gastown/src/dos/town/patrol.ts +++ b/cloudflare-gastown/src/dos/town/patrol.ts @@ -586,6 +586,79 @@ export function feedStrandedConvoys(sql: SqlStorage, townId: string): void { } } +/** + * Recover open beads whose assigned agent is no longer hooked to them. + * + * After container restarts, review failures, or rework cycles, beads + * can end up in 'open' with a stale assignee_agent_bead_id — the agent + * has been unhooked but the bead still references it. Neither + * feedStrandedConvoys (requires assignee IS NULL) nor schedulePendingWork + * (requires agent hooked) will pick these up. + * + * For each orphaned bead, re-hook the assigned agent (or a fresh one + * if the original is busy) so schedulePendingWork dispatches it on the + * next tick. 
+ */ +export function rehookOrphanedBeads(sql: SqlStorage, townId: string): void { + const OrphanedBeadRow = z.object({ + bead_id: z.string(), + rig_id: z.string().nullable(), + assignee_agent_bead_id: z.string(), + }); + + // Find open issue beads where the assigned agent's current_hook_bead_id + // does NOT point back to this bead (either NULL or hooked elsewhere). + const rows = OrphanedBeadRow.array().parse([ + ...query( + sql, + /* sql */ ` + SELECT ${beads.bead_id}, + ${beads.rig_id}, + ${beads.assignee_agent_bead_id} + FROM ${beads} + INNER JOIN ${agent_metadata} + ON ${agent_metadata.bead_id} = ${beads.assignee_agent_bead_id} + WHERE ${beads.status} = 'open' + AND ${beads.type} = 'issue' + AND ${beads.assignee_agent_bead_id} IS NOT NULL + AND ( + ${agent_metadata.current_hook_bead_id} IS NULL + OR ${agent_metadata.current_hook_bead_id} != ${beads.bead_id} + ) + `, + [] + ), + ]); + + if (rows.length === 0) return; + + console.log(`${LOG} rehookOrphanedBeads: found ${rows.length} orphaned bead(s)`); + + for (const row of rows) { + const rigId = row.rig_id; + if (!rigId) continue; + + try { + // Prefer re-using the original agent if it's idle+unhooked. + // Otherwise getOrCreateAgent finds or creates a fresh polecat. + const agent = getOrCreateAgent(sql, 'polecat', rigId, townId); + hookBead(sql, agent.id, row.bead_id); + query( + sql, + /* sql */ ` + UPDATE ${agent_metadata} + SET ${agent_metadata.columns.last_activity_at} = NULL + WHERE ${agent_metadata.bead_id} = ? + `, + [agent.id] + ); + console.log(`${LOG} rehookOrphanedBeads: re-hooked agent=${agent.id} to bead=${row.bead_id}`); + } catch (err) { + console.warn(`${LOG} rehookOrphanedBeads: failed to re-hook bead=${row.bead_id}:`, err); + } + } +} + /** * Detect crash loops: agents that have failed repeatedly within a * short window. Creates a triage request for LLM assessment. 
From 82a1a00309301d2f8282bd6052e2a12d02eacebd Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Wed, 18 Mar 2026 22:09:55 -0500 Subject: [PATCH 08/47] fix(gastown): add timeouts to container fetch calls and treat unknown agent status as not_found Container fetch calls through the Container class auto-start the container and wait for port readiness, which can block the alarm loop during container restarts/deploys. Add AbortSignal.timeout to: - ensureContainerReady health check (5s) - checkAgentContainerStatus (5s) - startAgentInContainer (60s) - refreshContainerToken (10s) Also treat non-OK responses and errors in checkAgentContainerStatus as not_found instead of unknown, so zombie detection resets the agent immediately rather than leaving it stuck in working state. --- cloudflare-gastown/src/dos/Town.do.ts | 4 +++- .../src/dos/town/container-dispatch.ts | 18 ++++++++++++------ 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/cloudflare-gastown/src/dos/Town.do.ts b/cloudflare-gastown/src/dos/Town.do.ts index a2ec376108..b4e28e18dc 100644 --- a/cloudflare-gastown/src/dos/Town.do.ts +++ b/cloudflare-gastown/src/dos/Town.do.ts @@ -3755,7 +3755,9 @@ export class TownDO extends DurableObject { try { const container = getTownContainerStub(this.env, townId); - await container.fetch('http://container/health'); + await container.fetch('http://container/health', { + signal: AbortSignal.timeout(5_000), + }); } catch { // Container is starting up or unavailable — alarm will retry } diff --git a/cloudflare-gastown/src/dos/town/container-dispatch.ts b/cloudflare-gastown/src/dos/town/container-dispatch.ts index 5f56a0608c..3098c9cea3 100644 --- a/cloudflare-gastown/src/dos/town/container-dispatch.ts +++ b/cloudflare-gastown/src/dos/town/container-dispatch.ts @@ -101,6 +101,7 @@ export async function ensureContainerToken( try { const resp = await container.fetch('http://container/refresh-token', { method: 'POST', + signal: AbortSignal.timeout(10_000), headers: { 
'Content-Type': 'application/json' }, body: JSON.stringify({ token }), }); @@ -376,6 +377,7 @@ export async function startAgentInContainer( const response = await container.fetch('http://container/agents/start', { method: 'POST', + signal: AbortSignal.timeout(60_000), headers: { 'Content-Type': 'application/json', 'X-Town-Config': JSON.stringify(containerConfig), @@ -533,27 +535,31 @@ export async function checkAgentContainerStatus( ): Promise<{ status: string; exitReason?: string }> { try { const container = getTownContainerStub(env, townId); - // TODO: Generally you should use containerFetch which waits for ports to be available - const response = await container.fetch(`http://container/agents/${agentId}/status`); + const response = await container.fetch(`http://container/agents/${agentId}/status`, { + signal: AbortSignal.timeout(5_000), + }); // 404 means the container is running but has no record of this agent // (e.g. after container eviction). Report as 'not_found' so // witnessPatrol can immediately reset and redispatch the agent // instead of waiting for the 2-hour GUPP timeout. if (response.status === 404) return { status: 'not_found' }; - if (!response.ok) return { status: 'unknown' }; + if (!response.ok) return { status: 'not_found' }; const data: unknown = await response.json(); if (typeof data === 'object' && data !== null && 'status' in data) { const status = (data as { status: unknown }).status; const exitReason = 'exitReason' in data ? (data as { exitReason: unknown }).exitReason : undefined; return { - status: typeof status === 'string' ? status : 'unknown', + status: typeof status === 'string' ? status : 'not_found', exitReason: typeof exitReason === 'string' ? 
exitReason : undefined, }; } - return { status: 'unknown' }; + return { status: 'not_found' }; } catch { - return { status: 'unknown' }; + // Timeout, network error, or container starting up — treat as + // not_found so zombie detection can reset the agent immediately + // rather than leaving it stuck in 'working' indefinitely. + return { status: 'not_found' }; } } From 6d14c93df5e10f0e58e6ad38abb252846a71dd85 Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Wed, 18 Mar 2026 22:11:57 -0500 Subject: [PATCH 09/47] fix(gastown): clear dispatch cooldown on zombie recovery for immediate re-dispatch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit witnessPatrol was setting last_activity_at = now() when resetting dead agents to idle, which triggered the 2-minute dispatch cooldown in schedulePendingWork. The cooldown exists to prevent double-dispatch of live agents with an in-flight container start — but a confirmed-dead agent has no in-flight dispatch to collide with. Set last_activity_at = NULL instead so schedulePendingWork picks up the agent on the very next alarm tick. Reduces non-review agent recovery from ~2+ min to ~5-10s after container restart. --- cloudflare-gastown/src/dos/Town.do.ts | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/cloudflare-gastown/src/dos/Town.do.ts b/cloudflare-gastown/src/dos/Town.do.ts index b4e28e18dc..34fef3b067 100644 --- a/cloudflare-gastown/src/dos/Town.do.ts +++ b/cloudflare-gastown/src/dos/Town.do.ts @@ -2910,15 +2910,20 @@ export class TownDO extends DurableObject { reviewQueue.agentCompleted(this.sql, agentId, { status: 'completed' }); continue; } + // Clear last_activity_at so schedulePendingWork picks this agent + // up on the very next tick without waiting for the dispatch + // cooldown. The cooldown protects against double-dispatch of live + // agents with an in-flight start — a dead agent has no in-flight + // dispatch to collide with. 
query( this.sql, /* sql */ ` UPDATE ${agent_metadata} SET ${agent_metadata.columns.status} = 'idle', - ${agent_metadata.columns.last_activity_at} = ? + ${agent_metadata.columns.last_activity_at} = NULL WHERE ${agent_metadata.bead_id} = ? `, - [now(), agentId] + [agentId] ); } } From 4251f08b7fb94a1e75d06ac8950bf5cdecf04cc9 Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Wed, 18 Mar 2026 22:15:39 -0500 Subject: [PATCH 10/47] docs(gastown): document DO sub-module pattern in AGENTS.md --- cloudflare-gastown/AGENTS.md | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/cloudflare-gastown/AGENTS.md b/cloudflare-gastown/AGENTS.md index 563a9a66fc..f06633c061 100644 --- a/cloudflare-gastown/AGENTS.md +++ b/cloudflare-gastown/AGENTS.md @@ -8,6 +8,37 @@ ## Durable Objects - Each DO module must export a `get{ClassName}Stub` helper function (e.g. `getRigDOStub`) that centralizes how that DO namespace creates instances. Callers should use this helper instead of accessing the namespace binding directly. +- **Sub-modules for large DOs**: When a Durable Object grows beyond a few hundred lines, extract domain logic into sub-modules under a `/` directory alongside the DO file. For example, `Town.do.ts` delegates to modules in `town/`: + + ``` + dos/ + Town.do.ts # Class definition, RPC methods, alarm loop + town/ + agents.ts # Agent CRUD, hook management + beads.ts # Bead CRUD, convoy progress + scheduling.ts # Agent dispatch, pending work scheduling + review-queue.ts # Review lifecycle, recovery + patrol.ts # Zombie detection, stale hook recovery + config.ts # Town configuration + rigs.ts # Rig registry + mail.ts # Inter-agent mail + container-dispatch.ts # Container start/stop/status + ``` + + Each sub-module exports plain functions (not classes) that accept `SqlStorage` and any other required context as arguments. 
The DO imports them with the `import * as X` pattern: + + ```ts + import * as beadOps from './town/beads'; + import * as agents from './town/agents'; + import * as scheduling from './town/scheduling'; + + // In the DO class: + beadOps.updateBeadStatus(this.sql, beadId, 'closed', agentId); + agents.getOrCreateAgent(this.sql, 'polecat', rigId, this.townId); + await scheduling.schedulePendingWork(this.schedulingCtx); + ``` + + This keeps the DO class thin (RPC surface + orchestration) while sub-modules own the business logic. The `import * as X` pattern makes call sites self-documenting — you can always tell which domain a function belongs to. ## IO boundaries From 627c7116790f8955c9ed6536d6f896e885019137 Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Wed, 18 Mar 2026 22:44:32 -0500 Subject: [PATCH 11/47] fix(gastown): extend rehookOrphanedBeads to recover in_progress beads with unhooked agents MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit rehookOrphanedBeads previously only recovered 'open' beads. After container restarts, beads can also get stuck in 'in_progress' with a stale assignee — the agent was unhooked by witnessPatrol but the bead status was never rolled back. Neither schedulePendingWork nor any other recovery path picks these up. Now also matches in_progress beads where the assigned agent is not working and not hooked. Resets in_progress beads to open before re-hooking so the dispatch flow starts cleanly. Filters out agents in 'working' status to avoid interfering with transient races during dispatch. 
--- cloudflare-gastown/src/dos/town/patrol.ts | 41 +++++++++++++++-------- 1 file changed, 27 insertions(+), 14 deletions(-) diff --git a/cloudflare-gastown/src/dos/town/patrol.ts b/cloudflare-gastown/src/dos/town/patrol.ts index e3675eb5e1..f1d6cf30ea 100644 --- a/cloudflare-gastown/src/dos/town/patrol.ts +++ b/cloudflare-gastown/src/dos/town/patrol.ts @@ -587,40 +587,47 @@ export function feedStrandedConvoys(sql: SqlStorage, townId: string): void { } /** - * Recover open beads whose assigned agent is no longer hooked to them. + * Recover beads whose assigned agent is no longer hooked to them. * * After container restarts, review failures, or rework cycles, beads - * can end up in 'open' with a stale assignee_agent_bead_id — the agent - * has been unhooked but the bead still references it. Neither - * feedStrandedConvoys (requires assignee IS NULL) nor schedulePendingWork - * (requires agent hooked) will pick these up. + * can end up in 'open' or 'in_progress' with a stale + * assignee_agent_bead_id — the agent has been unhooked but the bead + * still references it. Neither feedStrandedConvoys (requires assignee + * IS NULL) nor schedulePendingWork (requires agent hooked) will pick + * these up. * - * For each orphaned bead, re-hook the assigned agent (or a fresh one - * if the original is busy) so schedulePendingWork dispatches it on the - * next tick. + * For each orphaned bead: + * - If in_progress, reset to open (no agent is actually working on it) + * - Hook a polecat so schedulePendingWork dispatches it on the next tick */ export function rehookOrphanedBeads(sql: SqlStorage, townId: string): void { const OrphanedBeadRow = z.object({ bead_id: z.string(), + bead_status: z.string(), rig_id: z.string().nullable(), assignee_agent_bead_id: z.string(), }); - // Find open issue beads where the assigned agent's current_hook_bead_id - // does NOT point back to this bead (either NULL or hooked elsewhere). 
+ // Find open/in_progress issue beads where the assigned agent's + // current_hook_bead_id does NOT point back to this bead (either NULL + // or hooked elsewhere). Also require the agent to NOT be 'working' — + // if the agent is working, the hook mismatch may be a transient race + // during dispatch rather than a real orphan. const rows = OrphanedBeadRow.array().parse([ ...query( sql, /* sql */ ` SELECT ${beads.bead_id}, + ${beads.status} AS bead_status, ${beads.rig_id}, ${beads.assignee_agent_bead_id} FROM ${beads} INNER JOIN ${agent_metadata} ON ${agent_metadata.bead_id} = ${beads.assignee_agent_bead_id} - WHERE ${beads.status} = 'open' + WHERE ${beads.status} IN ('open', 'in_progress') AND ${beads.type} = 'issue' AND ${beads.assignee_agent_bead_id} IS NOT NULL + AND ${agent_metadata.status} != 'working' AND ( ${agent_metadata.current_hook_bead_id} IS NULL OR ${agent_metadata.current_hook_bead_id} != ${beads.bead_id} @@ -639,8 +646,12 @@ export function rehookOrphanedBeads(sql: SqlStorage, townId: string): void { if (!rigId) continue; try { - // Prefer re-using the original agent if it's idle+unhooked. - // Otherwise getOrCreateAgent finds or creates a fresh polecat. + // If the bead is in_progress but no agent is working on it, + // reset to open so the dispatch flow starts cleanly. 
+ if (row.bead_status === 'in_progress') { + updateBeadStatus(sql, row.bead_id, 'open', 'system'); + } + const agent = getOrCreateAgent(sql, 'polecat', rigId, townId); hookBead(sql, agent.id, row.bead_id); query( @@ -652,7 +663,9 @@ export function rehookOrphanedBeads(sql: SqlStorage, townId: string): void { `, [agent.id] ); - console.log(`${LOG} rehookOrphanedBeads: re-hooked agent=${agent.id} to bead=${row.bead_id}`); + console.log( + `${LOG} rehookOrphanedBeads: re-hooked agent=${agent.id} to bead=${row.bead_id} (was ${row.bead_status})` + ); } catch (err) { console.warn(`${LOG} rehookOrphanedBeads: failed to re-hook bead=${row.bead_id}:`, err); } From 7b957fbeeb1b737aff18c357590ad19f7bf22086 Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Wed, 18 Mar 2026 22:53:46 -0500 Subject: [PATCH 12/47] fix(gastown): close remaining recovery gaps for MR beads and orphaned source beads MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Gap I: MR beads in in_progress with pr_url and a dead refinery were stuck permanently — recoverStuckReviews excludes pr_url, schedulePendingWork excludes refineries, and closeOrphanedReviewBeads only checked open beads. Expanded closeOrphanedReviewBeads to also match in_progress MR beads. Gap G: recoverOrphanedSourceBeads returned source beads to open but left the stale assignee_agent_bead_id, preventing feedStrandedConvoys (requires assignee IS NULL) from re-hooking convoy beads. Now clears the assignee so both feedStrandedConvoys and rehookOrphanedBeads can assign a fresh agent. 
--- .../src/dos/town/review-queue.ts | 22 +++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/cloudflare-gastown/src/dos/town/review-queue.ts b/cloudflare-gastown/src/dos/town/review-queue.ts index 26c02d0c69..b8ad6e60ce 100644 --- a/cloudflare-gastown/src/dos/town/review-queue.ts +++ b/cloudflare-gastown/src/dos/town/review-queue.ts @@ -436,6 +436,12 @@ const ORPHAN_REVIEW_TIMEOUT_MS = 30 * 60 * 1000; export function closeOrphanedReviewBeads(sql: SqlStorage): void { const cutoff = new Date(Date.now() - ORPHAN_REVIEW_TIMEOUT_MS).toISOString(); + // Match MR beads with pr_url that are open OR in_progress, stale beyond + // the timeout, and whose refinery agent is idle/dead/missing. The + // in_progress case covers a gap where the refinery dies mid-review + // on a PR-strategy bead: recoverStuckReviews excludes pr_url beads + // and schedulePendingWork excludes refineries, so nothing else + // recovers them. const orphanRows = [ ...query( sql, @@ -445,7 +451,7 @@ export function closeOrphanedReviewBeads(sql: SqlStorage): void { INNER JOIN ${review_metadata} ON ${beads.bead_id} = ${review_metadata.bead_id} LEFT JOIN ${agent_metadata} ON ${beads.assignee_agent_bead_id} = ${agent_metadata.bead_id} WHERE ${beads.type} = 'merge_request' - AND ${beads.status} = 'open' + AND ${beads.status} IN ('open', 'in_progress') AND ${review_metadata.pr_url} IS NOT NULL AND ${beads.updated_at} < ? AND ( @@ -525,8 +531,20 @@ export function recoverOrphanedSourceBeads(sql: SqlStorage): void { const parsed = z.object({ source_bead_id: z.string() }).parse(row); try { updateBeadStatus(sql, parsed.source_bead_id, 'open', 'system'); + // Clear the stale assignee so feedStrandedConvoys (which requires + // assignee IS NULL) can pick up convoy beads, and rehookOrphanedBeads + // or feedStrandedConvoys can assign a fresh agent. + query( + sql, + /* sql */ ` + UPDATE ${beads} + SET ${beads.columns.assignee_agent_bead_id} = NULL + WHERE ${beads.bead_id} = ? 
+ `, + [parsed.source_bead_id] + ); console.log( - `[review-queue] recoverOrphanedSourceBeads: returned bead=${parsed.source_bead_id} to open` + `[review-queue] recoverOrphanedSourceBeads: returned bead=${parsed.source_bead_id} to open (assignee cleared)` ); } catch (err) { console.warn( From c92318fc61712f26189b723c2e512c1c4bcfc0ea Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Wed, 18 Mar 2026 22:59:58 -0500 Subject: [PATCH 13/47] fix(gastown): use bead.bead_id instead of stale agent snapshot in dispatchAgent rollback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit dispatchAgent's failure rollback paths used agent.current_hook_bead_id to roll back the bead status. But when called from rework dispatch (completeReviewWithResult, failReviewWithRework, agentCompleted), the agent snapshot is fetched by getOrCreateAgent BEFORE hookBead is called. So agent.current_hook_bead_id is null in the snapshot, and the bead rollback is silently skipped — leaving the bead stuck in in_progress with an idle unhooked agent. Use bead.bead_id (the actual bead being dispatched) for rollback, which is always correct regardless of when the agent snapshot was taken. --- cloudflare-gastown/src/dos/town/scheduling.ts | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/cloudflare-gastown/src/dos/town/scheduling.ts b/cloudflare-gastown/src/dos/town/scheduling.ts index a250adcb41..7bc10a5315 100644 --- a/cloudflare-gastown/src/dos/town/scheduling.ts +++ b/cloudflare-gastown/src/dos/town/scheduling.ts @@ -152,7 +152,10 @@ export async function dispatchAgent( role: agent.role, }); } else { - // Container failed — roll back agent to idle, bead to open + // Container failed — roll back agent to idle, bead to open. + // Use bead.bead_id (the actual bead being dispatched) rather than + // agent.current_hook_bead_id which may be stale if the agent + // snapshot was taken before hookBead was called. 
query( ctx.sql, /* sql */ ` @@ -162,9 +165,7 @@ export async function dispatchAgent( `, [agent.id] ); - if (agent.current_hook_bead_id) { - beadOps.updateBeadStatus(ctx.sql, agent.current_hook_bead_id, 'open', agent.id); - } + beadOps.updateBeadStatus(ctx.sql, bead.bead_id, 'open', agent.id); ctx.emitEvent({ event: 'agent.dispatch_failed', townId: ctx.townId, @@ -188,9 +189,7 @@ export async function dispatchAgent( `, [agent.id] ); - if (agent.current_hook_bead_id) { - beadOps.updateBeadStatus(ctx.sql, agent.current_hook_bead_id, 'open', agent.id); - } + beadOps.updateBeadStatus(ctx.sql, bead.bead_id, 'open', agent.id); } catch (rollbackErr) { console.error(`${LOG} dispatchAgent: rollback also failed:`, rollbackErr); } From 5b01044683839c99624f92957d106b5512934072 Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Wed, 18 Mar 2026 23:05:44 -0500 Subject: [PATCH 14/47] fix(gastown): prevent recoverStuckReviews from resetting MR beads with active refinery MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit recoverStuckReviews was resetting in_progress MR beads to open every alarm tick when they exceeded the timeout, even when the refinery agent was actively working on the review. This caused infinite reset loops where the refinery's work was repeatedly interrupted. Add NOT EXISTS check to exclude MR beads whose assigned refinery agent is currently in 'working' status. Also restore the timeout to 5 min since the working-agent guard now prevents false positives — the timeout only fires when the agent is truly dead/idle. 
--- cloudflare-gastown/src/dos/town/review-queue.ts | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/cloudflare-gastown/src/dos/town/review-queue.ts b/cloudflare-gastown/src/dos/town/review-queue.ts index b8ad6e60ce..7e5bbad112 100644 --- a/cloudflare-gastown/src/dos/town/review-queue.ts +++ b/cloudflare-gastown/src/dos/town/review-queue.ts @@ -29,10 +29,9 @@ import { getRig } from './rigs'; import type { ReviewQueueInput, ReviewQueueEntry, AgentDoneInput, Molecule } from '../../types'; // Review entries stuck in 'running' past this timeout are reset to 'pending'. -// Set slightly above the dispatch cooldown (2 min) so that zombie -// detection + cooldown expiry have a chance to recover the agent -// before we force-reset the MR bead. -const REVIEW_RUNNING_TIMEOUT_MS = 2.5 * 60 * 1000; +// Only applies when the assigned refinery agent is NOT actively working +// (the query in recoverStuckReviews excludes working agents). +const REVIEW_RUNNING_TIMEOUT_MS = 5 * 60 * 1000; function generateId(): string { return crypto.randomUUID(); @@ -417,6 +416,11 @@ export function recoverStuckReviews(sql: SqlStorage): void { FROM ${review_metadata} WHERE ${review_metadata.pr_url} IS NOT NULL ) + AND NOT EXISTS ( + SELECT 1 FROM ${agent_metadata} + WHERE ${agent_metadata.current_hook_bead_id} = ${beads.bead_id} + AND ${agent_metadata.status} = 'working' + ) `, [now(), timeout] ); From 969fb3e8dc8d391701f1634c02399a01df682262 Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Wed, 18 Mar 2026 23:21:55 -0500 Subject: [PATCH 15/47] fix(gastown): add dispatch cooldown on failure and increase MAX_DISPATCH_ATTEMPTS After a deploy + container eviction, dispatch failures burn through MAX_DISPATCH_ATTEMPTS (was 5) within 25 seconds, permanently failing beads before the container has time to start. Two changes: 1. Increase MAX_DISPATCH_ATTEMPTS from 5 to 20 to tolerate longer container cold starts. 2. 
Set last_activity_at = now() on dispatch failure to trigger the 2-min dispatch cooldown, giving the container time to start up before the next retry. Combined with 20 max attempts, this allows up to 40 min of retries. --- cloudflare-gastown/src/dos/town/scheduling.ts | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/cloudflare-gastown/src/dos/town/scheduling.ts b/cloudflare-gastown/src/dos/town/scheduling.ts index 7bc10a5315..66859bd6bb 100644 --- a/cloudflare-gastown/src/dos/town/scheduling.ts +++ b/cloudflare-gastown/src/dos/town/scheduling.ts @@ -23,7 +23,7 @@ const LOG = '[scheduling]'; // ── Constants ────────────────────────────────────────────────────────── export const DISPATCH_COOLDOWN_MS = 2 * 60_000; // 2 min -export const MAX_DISPATCH_ATTEMPTS = 5; +export const MAX_DISPATCH_ATTEMPTS = 20; // ── Context passed by the Town DO ────────────────────────────────────── @@ -156,14 +156,18 @@ export async function dispatchAgent( // Use bead.bead_id (the actual bead being dispatched) rather than // agent.current_hook_bead_id which may be stale if the agent // snapshot was taken before hookBead was called. + // Set last_activity_at to now() so the dispatch cooldown prevents + // immediate retry — the container may need time to start up after + // a deploy/eviction. query( ctx.sql, /* sql */ ` UPDATE ${agent_metadata} - SET ${agent_metadata.columns.status} = 'idle' + SET ${agent_metadata.columns.status} = 'idle', + ${agent_metadata.columns.last_activity_at} = ? WHERE ${agent_metadata.bead_id} = ? `, - [agent.id] + [now(), agent.id] ); beadOps.updateBeadStatus(ctx.sql, bead.bead_id, 'open', agent.id); ctx.emitEvent({ @@ -184,10 +188,11 @@ export async function dispatchAgent( ctx.sql, /* sql */ ` UPDATE ${agent_metadata} - SET ${agent_metadata.columns.status} = 'idle' + SET ${agent_metadata.columns.status} = 'idle', + ${agent_metadata.columns.last_activity_at} = ? WHERE ${agent_metadata.bead_id} = ? 
`, - [agent.id] + [now(), agent.id] ); beadOps.updateBeadStatus(ctx.sql, bead.bead_id, 'open', agent.id); } catch (rollbackErr) { From 811517978be3097bca8c5da91985e5cf0aaa6d6a Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Thu, 19 Mar 2026 08:35:13 -0500 Subject: [PATCH 16/47] fix(gastown): handle unhooked agent in agentDone gracefully instead of throwing 500 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the container restarts during a review, witnessPatrol unhooks the refinery agent. If the refinery process resumes and calls gt_done after being unhooked, agentDone threw 'Agent has no hooked bead' → 500 error. This triggered triage requests and left the bead in a stuck state. Return gracefully instead of throwing — the recovery paths (recoverStuckReviews, rehookOrphanedBeads) will handle the bead lifecycle. --- cloudflare-gastown/src/dos/town/review-queue.ts | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/cloudflare-gastown/src/dos/town/review-queue.ts b/cloudflare-gastown/src/dos/town/review-queue.ts index 7e5bbad112..65b3b3873f 100644 --- a/cloudflare-gastown/src/dos/town/review-queue.ts +++ b/cloudflare-gastown/src/dos/town/review-queue.ts @@ -564,7 +564,16 @@ export function recoverOrphanedSourceBeads(sql: SqlStorage): void { export function agentDone(sql: SqlStorage, agentId: string, input: AgentDoneInput): void { const agent = getAgent(sql, agentId); if (!agent) throw new Error(`Agent ${agentId} not found`); - if (!agent.current_hook_bead_id) throw new Error(`Agent ${agentId} has no hooked bead`); + if (!agent.current_hook_bead_id) { + // The agent was unhooked by a recovery path (witnessPatrol, rehookOrphanedBeads) + // between when the agent finished work and when it called gt_done. + // This is expected during container restarts. Log and return gracefully + // rather than 500ing — the recovery paths will handle the bead lifecycle. 
+ console.warn( + `[review-queue] agentDone: agent ${agentId} has no hooked bead (likely unhooked by recovery) — ignoring` + ); + return; + } // Triage batch beads don't produce code — close and unhook without // submitting to the review queue. Only applies to system-created triage From 0d403d5f8e756e84397e907455c4c46db8621bbb Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Thu, 19 Mar 2026 08:56:18 -0500 Subject: [PATCH 17/47] fix(gastown): resolve kilocodeToken for refinery via town config fallback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit processReviewQueue passed rigConfig.kilocodeToken directly to startAgentInContainer, which is undefined when the token was stored in the town config rather than the rig config. Polecats work because dispatchAgent calls resolveKilocodeToken() which falls back to the town config. The refinery got no token → container rejected the start → every review failed with 'Refinery container failed to start'. Use the same resolveKilocodeToken() fallback chain. --- cloudflare-gastown/src/dos/Town.do.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloudflare-gastown/src/dos/Town.do.ts b/cloudflare-gastown/src/dos/Town.do.ts index 34fef3b067..ee41d6daab 100644 --- a/cloudflare-gastown/src/dos/Town.do.ts +++ b/cloudflare-gastown/src/dos/Town.do.ts @@ -3305,7 +3305,7 @@ export class TownDO extends DurableObject { // may be a convoy feature branch that doesn't exist on the remote yet. // The refinery's system prompt tells it which branch to merge into. defaultBranch: rigConfig.defaultBranch, - kilocodeToken: rigConfig.kilocodeToken, + kilocodeToken: rigConfig.kilocodeToken ?? 
(await this.resolveKilocodeToken()), townConfig, systemPromptOverride: systemPrompt, platformIntegrationId: rigConfig.platformIntegrationId, From 6468c95547065e31ae12c8850deafb99b656848b Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Thu, 19 Mar 2026 09:15:06 -0500 Subject: [PATCH 18/47] fix(container): skip LFS smudge filter for all git operations in container MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Worktree creation fails when repos contain LFS-tracked files (e.g. .mp4) and the LFS batch endpoint can't resolve credentials. This blocks agent start entirely — the container returns a non-200 for /agents/start and processReviewQueue fails the review with 'Refinery container failed to start'. Set GIT_LFS_SKIP_SMUDGE=1 globally in the git exec helper. Agents don't need binary assets — LFS files are checked out as pointer files instead of downloading the actual content. --- cloudflare-gastown/container/src/git-manager.ts | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cloudflare-gastown/container/src/git-manager.ts b/cloudflare-gastown/container/src/git-manager.ts index 2d154d8769..a65d34370b 100644 --- a/cloudflare-gastown/container/src/git-manager.ts +++ b/cloudflare-gastown/container/src/git-manager.ts @@ -191,6 +191,11 @@ async function exec(cmd: string, args: string[], cwd?: string): Promise // Public repos clone without auth; private repos fail fast with // a clear error instead of hanging on a username prompt. GIT_TERMINAL_PROMPT: '0', + // Skip LFS smudge filter during checkout/worktree operations. + // Agents don't need binary assets (videos, images, etc.) and + // LFS downloads can fail when the credential helper doesn't + // cover the LFS batch endpoint, blocking worktree creation. 
+ GIT_LFS_SKIP_SMUDGE: '1', }, }); From b2a4e196d94a6de9d4e313817dda50dba6a7d62e Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Thu, 19 Mar 2026 09:24:34 -0500 Subject: [PATCH 19/47] fix(container): add global .gitconfig to skip LFS smudge for agent user Belt-and-suspenders alongside the GIT_LFS_SKIP_SMUDGE env var: install a .gitconfig for the agent user that configures the LFS filter to skip smudge and excludes all files from LFS fetch. This persists in the Docker image layer and covers any code path that runs git outside the exec() helper. --- cloudflare-gastown/container/Dockerfile | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/cloudflare-gastown/container/Dockerfile b/cloudflare-gastown/container/Dockerfile index 4c653a7485..0916754f59 100644 --- a/cloudflare-gastown/container/Dockerfile +++ b/cloudflare-gastown/container/Dockerfile @@ -44,6 +44,14 @@ RUN cd /opt/gastown-plugin && npm install --omit=dev && \ ln -s /opt/gastown-plugin/index.ts /home/agent/.config/kilo/plugins/gastown.ts && \ chown -R agent:agent /home/agent/.config +# ── Git config for agent user ─────────────────────────────────────── +# Skip LFS smudge filter: agents don't need binary assets and LFS +# downloads can fail when credentials don't cover the batch endpoint. +# Also disable LFS fetch entirely so clone/worktree never stalls. 
+RUN printf '[filter "lfs"]\n\tsmudge = git-lfs smudge --skip -- %%f\n\tprocess = git-lfs filter-process --skip\n\tclean = git-lfs clean -- %%f\n\trequired = true\n[lfs]\n\tfetchexclude = *\n' \ + > /home/agent/.gitconfig && \ + chown agent:agent /home/agent/.gitconfig + WORKDIR /app # ── Install production deps via pnpm ──────────────────────────────── From 5a158f172404d52bd436abb2a8c079af676ab1f3 Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Thu, 19 Mar 2026 09:35:40 -0500 Subject: [PATCH 20/47] fix(gastown): prevent false zombie detection from resetting active refinery MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three related fixes to stop recoverStuckReviews from resetting MR beads while the refinery is actively reviewing: 1. checkAgentContainerStatus: restore 'unknown' for non-404 errors and timeouts instead of aggressive 'not_found'. Only 404 (container confirms agent doesn't exist) triggers zombie reset. Timeout/errors return 'unknown' which witnessPatrol ignores — the GUPP system handles truly dead agents after 2 hours. 2. recoverStuckReviews: exclude MR beads where ANY agent is hooked (not just 'working' agents). After witnessPatrol resets a refinery to idle but keeps the hook, the old check saw 'no working agent' and reset the MR bead while the refinery was about to be re-dispatched. 3. Increase REVIEW_RUNNING_TIMEOUT_MS from 5 min to 15 min. Reviews legitimately take 5-10 min (clone + review + test + merge). 
--- .../src/dos/town/container-dispatch.ts | 20 +++++++++++-------- .../src/dos/town/review-queue.ts | 7 +++---- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/cloudflare-gastown/src/dos/town/container-dispatch.ts b/cloudflare-gastown/src/dos/town/container-dispatch.ts index 3098c9cea3..37df08c7bd 100644 --- a/cloudflare-gastown/src/dos/town/container-dispatch.ts +++ b/cloudflare-gastown/src/dos/town/container-dispatch.ts @@ -536,30 +536,34 @@ export async function checkAgentContainerStatus( try { const container = getTownContainerStub(env, townId); const response = await container.fetch(`http://container/agents/${agentId}/status`, { - signal: AbortSignal.timeout(5_000), + signal: AbortSignal.timeout(10_000), }); // 404 means the container is running but has no record of this agent // (e.g. after container eviction). Report as 'not_found' so // witnessPatrol can immediately reset and redispatch the agent // instead of waiting for the 2-hour GUPP timeout. if (response.status === 404) return { status: 'not_found' }; - if (!response.ok) return { status: 'not_found' }; + // Non-OK but not 404 — container is having issues but may still + // have the agent running. Return 'unknown' so witnessPatrol doesn't + // falsely reset a working agent. + if (!response.ok) return { status: 'unknown' }; const data: unknown = await response.json(); if (typeof data === 'object' && data !== null && 'status' in data) { const status = (data as { status: unknown }).status; const exitReason = 'exitReason' in data ? (data as { exitReason: unknown }).exitReason : undefined; return { - status: typeof status === 'string' ? status : 'not_found', + status: typeof status === 'string' ? status : 'unknown', exitReason: typeof exitReason === 'string' ? 
exitReason : undefined, }; } - return { status: 'not_found' }; + return { status: 'unknown' }; } catch { - // Timeout, network error, or container starting up — treat as - // not_found so zombie detection can reset the agent immediately - // rather than leaving it stuck in 'working' indefinitely. - return { status: 'not_found' }; + // Timeout, network error, or container starting up — return + // 'unknown' so witnessPatrol doesn't falsely reset working agents. + // True zombies will be caught after repeated 'unknown' results + // once the GUPP/heartbeat timeout expires. + return { status: 'unknown' }; } } diff --git a/cloudflare-gastown/src/dos/town/review-queue.ts b/cloudflare-gastown/src/dos/town/review-queue.ts index 65b3b3873f..e01bc6bfa3 100644 --- a/cloudflare-gastown/src/dos/town/review-queue.ts +++ b/cloudflare-gastown/src/dos/town/review-queue.ts @@ -29,9 +29,9 @@ import { getRig } from './rigs'; import type { ReviewQueueInput, ReviewQueueEntry, AgentDoneInput, Molecule } from '../../types'; // Review entries stuck in 'running' past this timeout are reset to 'pending'. -// Only applies when the assigned refinery agent is NOT actively working -// (the query in recoverStuckReviews excludes working agents). -const REVIEW_RUNNING_TIMEOUT_MS = 5 * 60 * 1000; +// Only applies when no agent (working or idle) is hooked to the MR bead. +// Set to 15 min to give the refinery ample time for clone + review + merge.
+const REVIEW_RUNNING_TIMEOUT_MS = 15 * 60 * 1000; function generateId(): string { return crypto.randomUUID(); @@ -419,7 +419,6 @@ export function recoverStuckReviews(sql: SqlStorage): void { AND NOT EXISTS ( SELECT 1 FROM ${agent_metadata} WHERE ${agent_metadata.current_hook_bead_id} = ${beads.bead_id} - AND ${agent_metadata.status} = 'working' ) `, [now(), timeout] From 480489847ceab085889c07b39d6c966c0572eb6b Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Thu, 19 Mar 2026 09:51:45 -0500 Subject: [PATCH 21/47] fix(gastown): route dead agents through agentCompleted for proper bead lifecycle witnessPatrol was resetting dead agents to idle without handling the bead lifecycle. For refineries, this left MR beads stuck in in_progress with an idle+hooked refinery that schedulePendingWork ignores (refinery exclusion). The refinery's gt_done would also fail silently (agent already unhooked by recovery). Now route ALL dead agents through agentCompleted, which properly: - For refineries: fails the MR bead and returns the source bead to in_progress for rework dispatch - For polecats: fails/closes the hooked bead - For completed exits: closes the bead normally This ensures the bead lifecycle is always properly transitioned when an agent dies, regardless of role. 
--- cloudflare-gastown/src/dos/Town.do.ts | 55 ++++++++++++++++++--------- 1 file changed, 37 insertions(+), 18 deletions(-) diff --git a/cloudflare-gastown/src/dos/Town.do.ts b/cloudflare-gastown/src/dos/Town.do.ts index ee41d6daab..2f8eca88d5 100644 --- a/cloudflare-gastown/src/dos/Town.do.ts +++ b/cloudflare-gastown/src/dos/Town.do.ts @@ -2906,25 +2906,44 @@ export class TownDO extends DurableObject { const { agentId, containerInfo } = result.value; if (containerInfo.status === 'not_found' || containerInfo.status === 'exited') { - if (containerInfo.exitReason === 'completed') { - reviewQueue.agentCompleted(this.sql, agentId, { status: 'completed' }); - continue; + // Route ALL dead agents through agentCompleted so the bead + // lifecycle is properly handled. For refineries, this triggers + // the rework flow (MR bead failed → source bead back to + // in_progress). For polecats, it fails/closes the bead. + // 'completed' exit means normal termination; 'not_found' or + // no exit reason means the process died unexpectedly. + const status = containerInfo.exitReason === 'completed' ? 'completed' : 'failed'; + const result = reviewQueue.agentCompleted(this.sql, agentId, { + status, + reason: status === 'failed' ? 
'Agent process died (container restart or crash)' : undefined, + }); + + // For refinery rework: dispatch a polecat to re-work the source bead + if (result.reworkSourceBeadId) { + const sourceBead = beadOps.getBead(this.sql, result.reworkSourceBeadId); + if (sourceBead?.rig_id) { + try { + const reworkAgent = agents.getOrCreateAgent( + this.sql, + 'polecat', + sourceBead.rig_id, + this.townId + ); + agents.hookBead(this.sql, reworkAgent.id, result.reworkSourceBeadId); + this.dispatchAgent(reworkAgent, sourceBead).catch(err => + console.error( + `${TOWN_LOG} witnessPatrol: rework dispatch failed for bead=${result.reworkSourceBeadId}`, + err + ) + ); + } catch (err) { + console.warn( + `${TOWN_LOG} witnessPatrol: could not dispatch rework for bead=${result.reworkSourceBeadId}:`, + err + ); + } + } } - // Clear last_activity_at so schedulePendingWork picks this agent - // up on the very next tick without waiting for the dispatch - // cooldown. The cooldown protects against double-dispatch of live - // agents with an in-flight start — a dead agent has no in-flight - // dispatch to collide with. - query( - this.sql, - /* sql */ ` - UPDATE ${agent_metadata} - SET ${agent_metadata.columns.status} = 'idle', - ${agent_metadata.columns.last_activity_at} = NULL - WHERE ${agent_metadata.bead_id} = ? - `, - [agentId] - ); } } From b6c087389b17258055c2c5a2b3fa69d4e18c46e6 Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Thu, 19 Mar 2026 10:08:59 -0500 Subject: [PATCH 22/47] fix(gastown): don't reopen closed source beads when a stale MR bead fails MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When multiple MR beads exist for the same source bead (from repeated rework cycles), a successful merge closes the source bead. But if a STALE MR bead from a previous cycle then fails (e.g. 
refinery container failed to start), completeReviewWithResult(status:'failed') was blindly setting the source bead back to in_progress — even though it was already closed by the merged MR. Check the source bead's current status before transitioning. If already closed or failed, skip the in_progress transition. This prevents the endless cycle: open → in_progress → in_review → closed → in_progress (from stale MR) → open → repeat. --- cloudflare-gastown/src/dos/town/review-queue.ts | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/cloudflare-gastown/src/dos/town/review-queue.ts b/cloudflare-gastown/src/dos/town/review-queue.ts index e01bc6bfa3..542b6e9ded 100644 --- a/cloudflare-gastown/src/dos/town/review-queue.ts +++ b/cloudflare-gastown/src/dos/town/review-queue.ts @@ -314,11 +314,20 @@ export function completeReviewWithResult( }); // Return source bead to in_progress so the polecat can be re-dispatched // to resolve the conflict (in_review → in_progress rework flow). - updateBeadStatus(sql, entry.bead_id, 'in_progress', entry.agent_id); + // Skip if source bead already reached a terminal state. + const conflictSourceBead = getBead(sql, entry.bead_id); + if (conflictSourceBead && conflictSourceBead.status !== 'closed' && conflictSourceBead.status !== 'failed') { + updateBeadStatus(sql, entry.bead_id, 'in_progress', entry.agent_id); + } } else if (input.status === 'failed') { // Review failed (rework requested): return source bead to in_progress // so it can be re-dispatched (in_review → in_progress rework flow). - updateBeadStatus(sql, entry.bead_id, 'in_progress', entry.agent_id); + // BUT only if the source bead hasn't already reached a terminal state + // (e.g. closed by a different MR bead that merged successfully). 
+ const sourceBead = getBead(sql, entry.bead_id); + if (sourceBead && sourceBead.status !== 'closed' && sourceBead.status !== 'failed') { + updateBeadStatus(sql, entry.bead_id, 'in_progress', entry.agent_id); + } } } From 5d80215a32b6f4e1759836db9bbe8818ef1ad3eb Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Thu, 19 Mar 2026 10:39:58 -0500 Subject: [PATCH 23/47] fix(gastown): add diagnostic logging for refinery dispatch failures --- cloudflare-gastown/src/dos/Town.do.ts | 18 +++++++++++++++++- .../src/dos/town/container-dispatch.ts | 5 ++++- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/cloudflare-gastown/src/dos/Town.do.ts b/cloudflare-gastown/src/dos/Town.do.ts index 2f8eca88d5..47e5be1925 100644 --- a/cloudflare-gastown/src/dos/Town.do.ts +++ b/cloudflare-gastown/src/dos/Town.do.ts @@ -3334,8 +3334,24 @@ export class TownDO extends DurableObject { agents.unhookBead(this.sql, refineryAgent.id); agents.updateAgentStatus(this.sql, refineryAgent.id, 'idle'); console.error( - `${TOWN_LOG} processReviewQueue: refinery agent failed to start for entry=${entry.id}` + `${TOWN_LOG} processReviewQueue: refinery agent failed to start for entry=${entry.id} ` + + `rigId=${rigId} agentId=${refineryAgent.id} agentName=${refineryAgent.name} ` + + `branch=${entry.branch} kilocodeToken=${!!(rigConfig.kilocodeToken ?? 
(await this.resolveKilocodeToken()))}` ); + // Log the failure as a bead event so it's visible in the admin dashboard + beadOps.logBeadEvent(this.sql, { + beadId: entry.id, + agentId: refineryAgent.id, + eventType: 'status_changed', + newValue: 'dispatch_failed', + metadata: { + reason: 'container_start_failed', + rigId, + branch: entry.branch, + targetBranch, + mergeStrategy: effectiveMergeStrategy, + }, + }); this.failReviewWithRework(entry, 'Refinery container failed to start'); } } diff --git a/cloudflare-gastown/src/dos/town/container-dispatch.ts b/cloudflare-gastown/src/dos/town/container-dispatch.ts index 37df08c7bd..6292a1112b 100644 --- a/cloudflare-gastown/src/dos/town/container-dispatch.ts +++ b/cloudflare-gastown/src/dos/town/container-dispatch.ts @@ -429,7 +429,10 @@ export async function startAgentInContainer( if (!response.ok) { const text = await response.text().catch(() => '(unreadable)'); - console.error(`${TOWN_LOG} startAgentInContainer: error response: ${text.slice(0, 500)}`); + console.error( + `${TOWN_LOG} startAgentInContainer: error response (${response.status}) for ` + + `agent=${params.agentId} role=${params.role}: ${text.slice(0, 500)}` + ); } return response.ok; } catch (err) { From d7c2c4cccbc5c91301d5dd4f5299a9455cfe17cc Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Thu, 19 Mar 2026 10:54:10 -0500 Subject: [PATCH 24/47] fix(gastown): recover refinery gt_done when agent was unhooked by zombie detection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The critical bug: when witnessPatrol detects the refinery container process as not_found (during status check races), it unhooks the refinery. When the refinery then calls gt_done after successfully merging, agentDone found no hooked bead and silently returned — dropping the merge result. The source bead stayed open forever, cycling endlessly. 
Now when an unhooked refinery calls gt_done, find the most recent non-closed MR bead assigned to it and complete the review. This ensures the merge is recorded even when zombie detection races with gt_done. --- .../src/dos/town/review-queue.ts | 57 +++++++++++++++++-- 1 file changed, 52 insertions(+), 5 deletions(-) diff --git a/cloudflare-gastown/src/dos/town/review-queue.ts b/cloudflare-gastown/src/dos/town/review-queue.ts index 542b6e9ded..4c3d831a28 100644 --- a/cloudflare-gastown/src/dos/town/review-queue.ts +++ b/cloudflare-gastown/src/dos/town/review-queue.ts @@ -573,12 +573,59 @@ export function agentDone(sql: SqlStorage, agentId: string, input: AgentDoneInpu const agent = getAgent(sql, agentId); if (!agent) throw new Error(`Agent ${agentId} not found`); if (!agent.current_hook_bead_id) { - // The agent was unhooked by a recovery path (witnessPatrol, rehookOrphanedBeads) - // between when the agent finished work and when it called gt_done. - // This is expected during container restarts. Log and return gracefully - // rather than 500ing — the recovery paths will handle the bead lifecycle. + // The agent was unhooked by a recovery path (witnessPatrol, + // rehookOrphanedBeads) between when the agent finished work and + // when it called gt_done. + // + // For refineries, this is critical: the refinery successfully merged + // but the hook was cleared by zombie detection. We MUST still complete + // the review — otherwise the source bead stays open forever. Find the + // most recent non-closed MR bead assigned to this agent and complete it. + if (agent.role === 'refinery') { + const recentMrRows = [ + ...query( + sql, + /* sql */ ` + SELECT ${beads.bead_id} + FROM ${beads} + WHERE ${beads.type} = 'merge_request' + AND ${beads.assignee_agent_bead_id} = ? 
+ AND ${beads.status} NOT IN ('closed', 'failed') + ORDER BY ${beads.updated_at} DESC + LIMIT 1 + `, + [agentId] + ), + ]; + if (recentMrRows.length > 0) { + const mrBeadId = z.object({ bead_id: z.string() }).parse(recentMrRows[0]).bead_id; + console.log( + `[review-queue] agentDone: unhooked refinery ${agentId} — recovering MR bead ${mrBeadId}` + ); + if (input.pr_url) { + const stored = setReviewPrUrl(sql, mrBeadId, input.pr_url); + if (stored) { + markReviewInReview(sql, mrBeadId); + } else { + completeReviewWithResult(sql, { + entry_id: mrBeadId, + status: 'failed', + message: `Refinery provided invalid pr_url: ${input.pr_url}`, + }); + } + } else { + completeReviewWithResult(sql, { + entry_id: mrBeadId, + status: 'merged', + message: input.summary ?? 'Merged by refinery agent (recovered from unhook)', + }); + } + return; + } + } + console.warn( - `[review-queue] agentDone: agent ${agentId} has no hooked bead (likely unhooked by recovery) — ignoring` + `[review-queue] agentDone: agent ${agentId} (role=${agent.role}) has no hooked bead — ignoring` ); return; } From 9f0fcd31767b76802cdadff5ddf0c904b6f65ebc Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Thu, 19 Mar 2026 11:15:00 -0500 Subject: [PATCH 25/47] fix(gastown): enforce terminal state immutability and simplify zombie recovery MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three fundamental changes to prevent beads from being reopened after reaching closed/failed: 1. updateBeadStatus: HARD INVARIANT — block all transitions out of closed/failed. Returns the bead as-is (no-op). This single guard makes it impossible for stale MR failures, recovery functions, or race conditions to reopen a terminal bead. 2. completeReview: same guard for the direct SQL path that bypasses updateBeadStatus. 3. witnessPatrol zombie detection: on abnormal agent death (container restart/crash), DON'T fail the bead. Just reset agent to idle. 
For polecats: keep hook so schedulePendingWork re-dispatches. For refineries: unhook so recoverStuckReviews handles the MR bead after timeout. This avoids false bead failures during container restarts and lets the refinery's gt_done succeed if the merge already happened. 4. rehookOrphanedBeads: add 2-minute time guard to avoid interfering with in-flight transitions (dispatch, gt_done, review completion). --- cloudflare-gastown/src/dos/Town.do.ts | 63 +++++++++---------- cloudflare-gastown/src/dos/town/beads.ts | 11 ++++ cloudflare-gastown/src/dos/town/patrol.ts | 8 ++- .../src/dos/town/review-queue.ts | 10 +++ 4 files changed, 55 insertions(+), 37 deletions(-) diff --git a/cloudflare-gastown/src/dos/Town.do.ts b/cloudflare-gastown/src/dos/Town.do.ts index 47e5be1925..4b8f4e1851 100644 --- a/cloudflare-gastown/src/dos/Town.do.ts +++ b/cloudflare-gastown/src/dos/Town.do.ts @@ -2906,43 +2906,36 @@ export class TownDO extends DurableObject { const { agentId, containerInfo } = result.value; if (containerInfo.status === 'not_found' || containerInfo.status === 'exited') { - // Route ALL dead agents through agentCompleted so the bead - // lifecycle is properly handled. For refineries, this triggers - // the rework flow (MR bead failed → source bead back to - // in_progress). For polecats, it fails/closes the bead. - // 'completed' exit means normal termination; 'not_found' or - // no exit reason means the process died unexpectedly. - const status = containerInfo.exitReason === 'completed' ? 'completed' : 'failed'; - const result = reviewQueue.agentCompleted(this.sql, agentId, { - status, - reason: status === 'failed' ? 
'Agent process died (container restart or crash)' : undefined, - }); + const agent = agents.getAgent(this.sql, agentId); + if (!agent) continue; - // For refinery rework: dispatch a polecat to re-work the source bead - if (result.reworkSourceBeadId) { - const sourceBead = beadOps.getBead(this.sql, result.reworkSourceBeadId); - if (sourceBead?.rig_id) { - try { - const reworkAgent = agents.getOrCreateAgent( - this.sql, - 'polecat', - sourceBead.rig_id, - this.townId - ); - agents.hookBead(this.sql, reworkAgent.id, result.reworkSourceBeadId); - this.dispatchAgent(reworkAgent, sourceBead).catch(err => - console.error( - `${TOWN_LOG} witnessPatrol: rework dispatch failed for bead=${result.reworkSourceBeadId}`, - err - ) - ); - } catch (err) { - console.warn( - `${TOWN_LOG} witnessPatrol: could not dispatch rework for bead=${result.reworkSourceBeadId}:`, - err - ); - } + if (containerInfo.exitReason === 'completed') { + // Normal exit — route through agentCompleted for proper lifecycle + reviewQueue.agentCompleted(this.sql, agentId, { status: 'completed' }); + } else { + // Abnormal death (container restart, OOM, crash). + // DON'T fail the bead — just reset the agent to idle. + // For polecats: keep hook so schedulePendingWork re-dispatches + // For refineries: unhook (recoverStuckReviews handles MR bead + // after timeout; the refinery may still call gt_done if the + // merge succeeded before the process died) + if (agent.role === 'refinery') { + agents.unhookBead(this.sql, agentId); } + query( + this.sql, + /* sql */ ` + UPDATE ${agent_metadata} + SET ${agent_metadata.columns.status} = 'idle', + ${agent_metadata.columns.dispatch_attempts} = 0 + WHERE ${agent_metadata.bead_id} = ? 
+ `, + [agentId] + ); + console.log( + `${TOWN_LOG} witnessPatrol: agent ${agentId} (${agent.role}) died abnormally — ` + + `reset to idle, bead status preserved for normal recovery` + ); } } } diff --git a/cloudflare-gastown/src/dos/town/beads.ts b/cloudflare-gastown/src/dos/town/beads.ts index cf0f009346..0095b444e6 100644 --- a/cloudflare-gastown/src/dos/town/beads.ts +++ b/cloudflare-gastown/src/dos/town/beads.ts @@ -258,6 +258,17 @@ export function updateBeadStatus( // No-op if already in the target status — avoids redundant events if (bead.status === status) return bead; + // HARD INVARIANT: terminal states (closed/failed) are immutable. + // Once a bead reaches a terminal state, no recovery function, stale MR + // failure, or race condition should ever change its status. Return the + // bead as-is (no-op, not an error) so callers don't need to pre-check. + if (bead.status === 'closed' || bead.status === 'failed') { + console.warn( + `[beads] updateBeadStatus: blocked ${bead.status} → ${status} for bead=${beadId} — terminal state is immutable` + ); + return bead; + } + const oldStatus = bead.status; const timestamp = now(); const closedAt = status === 'closed' ? timestamp : bead.closed_at; diff --git a/cloudflare-gastown/src/dos/town/patrol.ts b/cloudflare-gastown/src/dos/town/patrol.ts index f1d6cf30ea..4e18cbd6f2 100644 --- a/cloudflare-gastown/src/dos/town/patrol.ts +++ b/cloudflare-gastown/src/dos/town/patrol.ts @@ -612,7 +612,10 @@ export function rehookOrphanedBeads(sql: SqlStorage, townId: string): void { // current_hook_bead_id does NOT point back to this bead (either NULL // or hooked elsewhere). Also require the agent to NOT be 'working' — // if the agent is working, the hook mismatch may be a transient race - // during dispatch rather than a real orphan. + // during dispatch rather than a real orphan. Time guard: only touch + // beads orphaned for >2 min to avoid interfering with in-flight + // transitions (dispatch, gt_done, review completion). 
+ const cutoff = new Date(Date.now() - 2 * 60_000).toISOString(); const rows = OrphanedBeadRow.array().parse([ ...query( sql, @@ -627,13 +630,14 @@ export function rehookOrphanedBeads(sql: SqlStorage, townId: string): void { WHERE ${beads.status} IN ('open', 'in_progress') AND ${beads.type} = 'issue' AND ${beads.assignee_agent_bead_id} IS NOT NULL + AND ${beads.updated_at} < ? AND ${agent_metadata.status} != 'working' AND ( ${agent_metadata.current_hook_bead_id} IS NULL OR ${agent_metadata.current_hook_bead_id} != ${beads.bead_id} ) `, - [] + [cutoff] ), ]); diff --git a/cloudflare-gastown/src/dos/town/review-queue.ts b/cloudflare-gastown/src/dos/town/review-queue.ts index 4c3d831a28..e59f9cd593 100644 --- a/cloudflare-gastown/src/dos/town/review-queue.ts +++ b/cloudflare-gastown/src/dos/town/review-queue.ts @@ -225,6 +225,16 @@ export function completeReview( entryId: string, status: 'merged' | 'failed' ): void { + // Guard: don't overwrite terminal states (closed MR bead that was + // already merged should never be set to 'failed' by a stale call) + const current = getBead(sql, entryId); + if (current && (current.status === 'closed' || current.status === 'failed')) { + console.warn( + `[review-queue] completeReview: bead ${entryId} already ${current.status}, skipping` + ); + return; + } + const beadStatus = status === 'merged' ? 
'closed' : 'failed'; const timestamp = now(); query( From b78359728d7d2357c5d0af4f97a3d8e2dc0f35f5 Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Thu, 19 Mar 2026 11:24:08 -0500 Subject: [PATCH 26/47] debug: add temporary debugAgentMetadata endpoint --- cloudflare-gastown/src/dos/Town.do.ts | 19 +++++++++++++++++++ cloudflare-gastown/src/trpc/router.ts | 8 ++++++++ 2 files changed, 27 insertions(+) diff --git a/cloudflare-gastown/src/dos/Town.do.ts b/cloudflare-gastown/src/dos/Town.do.ts index 4b8f4e1851..ad9e65fba5 100644 --- a/cloudflare-gastown/src/dos/Town.do.ts +++ b/cloudflare-gastown/src/dos/Town.do.ts @@ -4041,6 +4041,25 @@ export class TownDO extends DurableObject { }; } + // DEBUG: raw agent_metadata dump — remove after debugging + async debugAgentMetadata(): Promise { + return [ + ...query( + this.sql, + /* sql */ ` + SELECT ${agent_metadata.bead_id}, + ${agent_metadata.role}, + ${agent_metadata.status}, + ${agent_metadata.current_hook_bead_id}, + ${agent_metadata.dispatch_attempts}, + ${agent_metadata.last_activity_at} + FROM ${agent_metadata} + `, + [] + ), + ]; + } + async destroy(): Promise { console.log(`${TOWN_LOG} destroy: clearing all storage and alarms`); diff --git a/cloudflare-gastown/src/trpc/router.ts b/cloudflare-gastown/src/trpc/router.ts index 420b52f69f..6d88f34422 100644 --- a/cloudflare-gastown/src/trpc/router.ts +++ b/cloudflare-gastown/src/trpc/router.ts @@ -1221,6 +1221,14 @@ export const gastownRouter = router({ const townStub = getTownDOStub(ctx.env, input.townId); return townStub.getBeadAsync(input.beadId); }), + + // DEBUG: raw agent_metadata dump — remove after debugging + debugAgentMetadata: adminProcedure + .input(z.object({ townId: z.string().uuid() })) + .query(async ({ ctx, input }) => { + const townStub = getTownDOStub(ctx.env, input.townId); + return townStub.debugAgentMetadata(); + }), }); export type GastownRouter = typeof gastownRouter; From 319ddef43b056c24baeaaaacdc22f5a2c15983e6 Mon Sep 17 00:00:00 2001 From: 
John Fawcett Date: Thu, 19 Mar 2026 11:33:26 -0500 Subject: [PATCH 27/47] fix(gastown): fix Zod parse failure in schedulePendingWork that silently broke all dispatching MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The SQL query in schedulePendingWork used 'agent_metadata.status AS status' which overwrote 'beads.status' in the result row. beads.status is an enum of open/in_progress/in_review/closed/failed, but agent_metadata.status is idle/working/stalled/dead. The AgentBeadRecord Zod schema rejected 'idle' as an invalid bead status, causing .parse() to throw silently inside the Promise.allSettled catch handler. Result: schedulePendingWork has been a complete no-op since the scheduling module was created. No agents were ever dispatched by the alarm loop — only fire-and-forget dispatches from slingBead/slingConvoy worked. Fix: alias agent_metadata.status as 'agent_status' to avoid overwriting beads.status, and use safeParse with error logging. --- cloudflare-gastown/src/dos/town/scheduling.ts | 45 +++++++++++-------- 1 file changed, 27 insertions(+), 18 deletions(-) diff --git a/cloudflare-gastown/src/dos/town/scheduling.ts b/cloudflare-gastown/src/dos/town/scheduling.ts index 66859bd6bb..348d9915bf 100644 --- a/cloudflare-gastown/src/dos/town/scheduling.ts +++ b/cloudflare-gastown/src/dos/town/scheduling.ts @@ -7,6 +7,7 @@ */ import * as Sentry from '@sentry/cloudflare'; +import { z } from 'zod'; import { beads, AgentBeadRecord } from '../../db/tables/beads.table'; import { agent_metadata } from '../../db/tables/agent-metadata.table'; import { query } from '../../util/query.util'; @@ -260,7 +261,7 @@ export async function schedulePendingWork(ctx: SchedulingContext): Promise SELECT ${beads}.*, ${agent_metadata.role}, ${agent_metadata.identity}, ${agent_metadata.container_process_id}, - ${agent_metadata.status} AS status, + ${agent_metadata.status} AS agent_status, ${agent_metadata.current_hook_bead_id}, 
${agent_metadata.dispatch_attempts}, ${agent_metadata.last_activity_at}, ${agent_metadata.checkpoint}, @@ -275,23 +276,31 @@ export async function schedulePendingWork(ctx: SchedulingContext): Promise [cooldownCutoff] ), ]; - const pendingAgents: Agent[] = AgentBeadRecord.array() - .parse(rows) - .map(row => ({ - id: row.bead_id, - rig_id: row.rig_id, - role: row.role, - name: row.title, - identity: row.identity, - status: row.status, - current_hook_bead_id: row.current_hook_bead_id, - dispatch_attempts: row.dispatch_attempts, - last_activity_at: row.last_activity_at, - checkpoint: row.checkpoint, - created_at: row.created_at, - agent_status_message: row.agent_status_message, - agent_status_updated_at: row.agent_status_updated_at, - })); + // Parse rows as AgentBeadRecord — the agent_metadata.status is aliased + // as agent_status (not status) to avoid overwriting beads.status which + // has a different enum (open/in_progress/... vs idle/working/...). + const parsed = z + .array(AgentBeadRecord.extend({ agent_status: z.string() })) + .safeParse(rows); + if (!parsed.success) { + console.error(`${LOG} schedulePendingWork: Zod parse failed:`, parsed.error.issues.slice(0, 3)); + return; + } + const pendingAgents: Agent[] = parsed.data.map(row => ({ + id: row.bead_id, + rig_id: row.rig_id, + role: row.role, + name: row.title, + identity: row.identity, + status: row.agent_status, + current_hook_bead_id: row.current_hook_bead_id, + dispatch_attempts: row.dispatch_attempts, + last_activity_at: row.last_activity_at, + checkpoint: row.checkpoint, + created_at: row.created_at, + agent_status_message: row.agent_status_message, + agent_status_updated_at: row.agent_status_updated_at, + })); console.log(`${LOG} schedulePendingWork: found ${pendingAgents.length} pending agents`); if (pendingAgents.length === 0) return; From c966dc03f2a5726e5093d01b0d15f2ce99e57425 Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Thu, 19 Mar 2026 11:46:20 -0500 Subject: [PATCH 28/47] debug: capture 
container start error on refinery agent status message --- cloudflare-gastown/src/dos/Town.do.ts | 25 ++++++------------- .../src/dos/town/container-dispatch.ts | 17 +++++++++++-- 2 files changed, 23 insertions(+), 19 deletions(-) diff --git a/cloudflare-gastown/src/dos/Town.do.ts b/cloudflare-gastown/src/dos/Town.do.ts index ad9e65fba5..46da14fb59 100644 --- a/cloudflare-gastown/src/dos/Town.do.ts +++ b/cloudflare-gastown/src/dos/Town.do.ts @@ -3326,25 +3326,16 @@ export class TownDO extends DurableObject { if (!started) { agents.unhookBead(this.sql, refineryAgent.id); agents.updateAgentStatus(this.sql, refineryAgent.id, 'idle'); + + const containerError = dispatch.getLastStartError() ?? 'unknown'; + const errorCtx = `entry=${entry.id} rigId=${rigId} branch=${entry.branch} ` + + `containerError=${containerError}`; console.error( - `${TOWN_LOG} processReviewQueue: refinery agent failed to start for entry=${entry.id} ` + - `rigId=${rigId} agentId=${refineryAgent.id} agentName=${refineryAgent.name} ` + - `branch=${entry.branch} kilocodeToken=${!!(rigConfig.kilocodeToken ?? 
(await this.resolveKilocodeToken()))}` + `${TOWN_LOG} processReviewQueue: refinery failed to start: ${errorCtx}` ); - // Log the failure as a bead event so it's visible in the admin dashboard - beadOps.logBeadEvent(this.sql, { - beadId: entry.id, - agentId: refineryAgent.id, - eventType: 'status_changed', - newValue: 'dispatch_failed', - metadata: { - reason: 'container_start_failed', - rigId, - branch: entry.branch, - targetBranch, - mergeStrategy: effectiveMergeStrategy, - }, - }); + agents.updateAgentStatusMessage(this.sql, refineryAgent.id, + `[dispatch_failed] ${containerError}`); + this.failReviewWithRework(entry, 'Refinery container failed to start'); } } diff --git a/cloudflare-gastown/src/dos/town/container-dispatch.ts b/cloudflare-gastown/src/dos/town/container-dispatch.ts index 6292a1112b..96acc42330 100644 --- a/cloudflare-gastown/src/dos/town/container-dispatch.ts +++ b/cloudflare-gastown/src/dos/town/container-dispatch.ts @@ -12,6 +12,13 @@ import { buildContainerConfig, resolveModel, resolveSmallModel } from './config' const TOWN_LOG = '[Town.do]'; +// Module-level diagnostic: stores the last container start error so +// callers can surface it via the admin API. Reset on each call. +let lastStartError: string | null = null; +export function getLastStartError(): string | null { + return lastStartError; +} + /** * Resolve the GASTOWN_JWT_SECRET binding to a string. 
*/ @@ -304,6 +311,7 @@ export async function startAgentInContainer( }>; } ): Promise { + lastStartError = null; console.log( `${TOWN_LOG} startAgentInContainer: agentId=${params.agentId} role=${params.role} name=${params.agentName}` ); @@ -429,14 +437,19 @@ export async function startAgentInContainer( if (!response.ok) { const text = await response.text().catch(() => '(unreadable)'); + const errorMsg = `(${response.status}) ${text.slice(0, 300)}`; console.error( - `${TOWN_LOG} startAgentInContainer: error response (${response.status}) for ` + - `agent=${params.agentId} role=${params.role}: ${text.slice(0, 500)}` + `${TOWN_LOG} startAgentInContainer: error response for ` + + `agent=${params.agentId} role=${params.role}: ${errorMsg}` ); + // Store error on a well-known key so the caller can read it + lastStartError = errorMsg; } return response.ok; } catch (err) { + const message = err instanceof Error ? err.message : String(err); console.error(`${TOWN_LOG} startAgentInContainer: EXCEPTION for agent ${params.agentId}:`, err); + lastStartError = `EXCEPTION: ${message.slice(0, 300)}`; return false; } } From 0d05bd2c1ecf97b8b5b3ae1485ac2a66e56806db Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Thu, 19 Mar 2026 11:59:53 -0500 Subject: [PATCH 29/47] fix(gastown): close stale MR beads when one MR merges for the same source When a review merges, close ALL other MR beads for the same source bead (open, in_progress, or failed). During rework cycles, multiple MR beads accumulate. Without cleanup, processReviewQueue pops a stale MR on the next alarm tick, fails to start the refinery (already exited), and failReviewWithRework reopens the source bead that was just closed. Also fixes agents working on open beads: the terminal state guard in updateBeadStatus prevents closed beads from being reopened, and the stale MR cleanup prevents the rework cascade that triggers the reopen. 
--- .../src/dos/town/review-queue.ts | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/cloudflare-gastown/src/dos/town/review-queue.ts b/cloudflare-gastown/src/dos/town/review-queue.ts index e59f9cd593..b49d48f887 100644 --- a/cloudflare-gastown/src/dos/town/review-queue.ts +++ b/cloudflare-gastown/src/dos/town/review-queue.ts @@ -289,6 +289,31 @@ export function completeReviewWithResult( const mergeTimestamp = now(); closeBead(sql, entry.bead_id, entry.agent_id); + // Close ALL other open/in_progress/failed MR beads for the same + // source bead. During rework cycles, multiple MR beads accumulate. + // Without this cleanup, stale MR beads trigger failReviewWithRework + // on the next alarm tick, reopening the source bead that was just + // closed by this merge. + query( + sql, + /* sql */ ` + UPDATE ${beads} + SET ${beads.columns.status} = 'closed', + ${beads.columns.updated_at} = ?, + ${beads.columns.closed_at} = ? + WHERE ${beads.type} = 'merge_request' + AND ${beads.bead_id} != ? + AND ${beads.status} NOT IN ('closed') + AND ${beads.bead_id} IN ( + SELECT dep.${bead_dependencies.columns.bead_id} + FROM ${bead_dependencies} AS dep + WHERE dep.${bead_dependencies.columns.depends_on_bead_id} = ? + AND dep.${bead_dependencies.columns.dependency_type} = 'tracks' + ) + `, + [mergeTimestamp, mergeTimestamp, input.entry_id, entry.bead_id] + ); + // closeBead → updateBeadStatus short-circuits when completeReview already // set the status to 'closed' via direct SQL, so updateConvoyProgress is // never reached transitively. 
Call it explicitly to ensure the convoy From a3ec77542e2a51c83d8a21deb809a01fa2eb47cb Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Thu, 19 Mar 2026 12:10:45 -0500 Subject: [PATCH 30/47] fix(gastown): skip popping MR beads whose source already has an in-flight review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit popReviewQueue was popping stale MR beads for the same source bead while a refinery was actively reviewing a different MR bead. This caused processReviewQueue to attempt starting a second refinery (which fails), then failReviewWithRework fires and reopens the source bead — interrupting the active review. Add NOT EXISTS subquery to skip MR beads whose source bead has a sibling MR bead already in_progress (indicating an active review). Combined with the stale MR cleanup on merge, this prevents the cascade where stale MR beads interfere with active reviews. --- .../src/dos/town/review-queue.ts | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/cloudflare-gastown/src/dos/town/review-queue.ts b/cloudflare-gastown/src/dos/town/review-queue.ts index b49d48f887..67bafed0d8 100644 --- a/cloudflare-gastown/src/dos/town/review-queue.ts +++ b/cloudflare-gastown/src/dos/town/review-queue.ts @@ -188,12 +188,34 @@ export function submitToReviewQueue(sql: SqlStorage, input: ReviewQueueInput): v } export function popReviewQueue(sql: SqlStorage): ReviewQueueEntry | null { + // Pop the oldest open MR bead, but skip any whose source bead already + // has another MR in_progress (i.e. a refinery is already reviewing it). + // This prevents popping stale MR beads and triggering failReviewWithRework + // while an active review is in flight for the same source. 
+ // + // The source bead is linked via bead_dependencies (dependency_type='tracks'): + // bead_dependencies.bead_id = MR bead + // bead_dependencies.depends_on_bead_id = source bead const rows = [ ...query( sql, /* sql */ ` ${REVIEW_JOIN} WHERE ${beads.status} = 'open' + AND NOT EXISTS ( + SELECT 1 + FROM ${bead_dependencies} AS my_dep + INNER JOIN ${bead_dependencies} AS sib_dep + ON sib_dep.${bead_dependencies.columns.depends_on_bead_id} = my_dep.${bead_dependencies.columns.depends_on_bead_id} + AND sib_dep.${bead_dependencies.columns.dependency_type} = 'tracks' + INNER JOIN ${beads} AS sibling + ON sibling.${beads.columns.bead_id} = sib_dep.${bead_dependencies.columns.bead_id} + WHERE my_dep.${bead_dependencies.columns.bead_id} = ${beads.bead_id} + AND my_dep.${bead_dependencies.columns.dependency_type} = 'tracks' + AND sibling.${beads.columns.type} = 'merge_request' + AND sibling.${beads.columns.status} = 'in_progress' + AND sibling.${beads.columns.bead_id} != ${beads.bead_id} + ) ORDER BY ${beads.created_at} ASC LIMIT 1 `, From 0a9b889c7f207d9f7d2ad3dc830f916db34c38a5 Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Thu, 19 Mar 2026 12:14:03 -0500 Subject: [PATCH 31/47] fix(gastown): never route refineries through agentCompleted from witnessPatrol MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit witnessPatrol's zombie detection was calling agentCompleted for refineries that showed as exited/completed. This races with gt_done: the refinery process exits after calling gt_done, witnessPatrol sees 'exited/completed' and calls agentCompleted, which finds the MR bead still in_progress (gt_done hasn't committed yet) and fails it. The refinery lifecycle is fully managed by gt_done (success) and recoverStuckReviews (timeout). witnessPatrol should only reset the refinery agent to idle and unhook — never touch the MR bead. 
--- cloudflare-gastown/src/dos/Town.do.ts | 35 ++++++++++++++++++--------- 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/cloudflare-gastown/src/dos/Town.do.ts b/cloudflare-gastown/src/dos/Town.do.ts index 46da14fb59..c61b2163c8 100644 --- a/cloudflare-gastown/src/dos/Town.do.ts +++ b/cloudflare-gastown/src/dos/Town.do.ts @@ -2909,19 +2909,30 @@ export class TownDO extends DurableObject { const agent = agents.getAgent(this.sql, agentId); if (!agent) continue; - if (containerInfo.exitReason === 'completed') { - // Normal exit — route through agentCompleted for proper lifecycle + if (agent.role === 'refinery') { + // NEVER route refineries through agentCompleted from witnessPatrol. + // The refinery's lifecycle is managed by: + // - gt_done (success path): closes MR bead + source bead + // - recoverStuckReviews (timeout): resets MR bead to open for retry + // Calling agentCompleted here races with gt_done and can fail + // an MR bead that the refinery successfully merged. + agents.unhookBead(this.sql, agentId); + query( + this.sql, + /* sql */ ` + UPDATE ${agent_metadata} + SET ${agent_metadata.columns.status} = 'idle', + ${agent_metadata.columns.dispatch_attempts} = 0 + WHERE ${agent_metadata.bead_id} = ? + `, + [agentId] + ); + } else if (containerInfo.exitReason === 'completed') { + // Non-refinery normal exit — route through agentCompleted reviewQueue.agentCompleted(this.sql, agentId, { status: 'completed' }); } else { - // Abnormal death (container restart, OOM, crash). - // DON'T fail the bead — just reset the agent to idle. 
- // For polecats: keep hook so schedulePendingWork re-dispatches - // For refineries: unhook (recoverStuckReviews handles MR bead - // after timeout; the refinery may still call gt_done if the - // merge succeeded before the process died) - if (agent.role === 'refinery') { - agents.unhookBead(this.sql, agentId); - } + // Non-refinery abnormal death — reset to idle, keep hook + // so schedulePendingWork re-dispatches on next tick query( this.sql, /* sql */ ` @@ -2934,7 +2945,7 @@ export class TownDO extends DurableObject { ); console.log( `${TOWN_LOG} witnessPatrol: agent ${agentId} (${agent.role}) died abnormally — ` + - `reset to idle, bead status preserved for normal recovery` + `reset to idle, bead status preserved for recovery` ); } } From 6aabbd5f42c721f878eb94cb15e2f9af4e5a44e6 Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Thu, 19 Mar 2026 12:42:12 -0500 Subject: [PATCH 32/47] =?UTF-8?q?fix(gastown):=20eliminate=20refinery=20ra?= =?UTF-8?q?ce=20conditions=20=E2=80=94=20never=20fail=20MR=20beads=20from?= =?UTF-8?q?=20recovery=20paths?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Root cause: witnessPatrol and agentCompleted were racing with gt_done. The refinery process exits after calling gt_done, but the exit notification (agentCompleted or witnessPatrol zombie detection) can arrive at the DO before gt_done finishes. This caused the MR bead to be marked 'failed' even though the merge succeeded. Changes: 1. witnessPatrol: keep refinery hook intact instead of unhooking. This preserves the NOT EXISTS guard in recoverStuckReviews, preventing it from resetting MR beads while the refinery might still call gt_done. 2. agentCompleted: NEVER fail MR beads for refineries. Just unhook and idle. The refinery lifecycle is exclusively managed by gt_done (success) and recoverStuckReviews (timeout after 30 min). 3. 
Increase REVIEW_RUNNING_TIMEOUT_MS to 30 min — reviews legitimately take 10-15 min, and the hook guard is the primary protection. 4. Reduce recoverOrphanedSourceBeads timeout to 5 min — by the time it runs, all MR beads are already terminal. --- cloudflare-gastown/src/dos/Town.do.ts | 15 +++--- .../src/dos/town/review-queue.ts | 46 ++++++++----------- 2 files changed, 28 insertions(+), 33 deletions(-) diff --git a/cloudflare-gastown/src/dos/Town.do.ts b/cloudflare-gastown/src/dos/Town.do.ts index c61b2163c8..0b48461ebc 100644 --- a/cloudflare-gastown/src/dos/Town.do.ts +++ b/cloudflare-gastown/src/dos/Town.do.ts @@ -2910,13 +2910,14 @@ export class TownDO extends DurableObject { if (!agent) continue; if (agent.role === 'refinery') { - // NEVER route refineries through agentCompleted from witnessPatrol. - // The refinery's lifecycle is managed by: - // - gt_done (success path): closes MR bead + source bead - // - recoverStuckReviews (timeout): resets MR bead to open for retry - // Calling agentCompleted here races with gt_done and can fail - // an MR bead that the refinery successfully merged. - agents.unhookBead(this.sql, agentId); + // NEVER unhook or agentComplete refineries from witnessPatrol. + // The refinery's lifecycle is managed entirely by: + // - gt_done (success): closes MR bead + source bead + unhooks + // - recoverStuckReviews (timeout): resets MR bead to open + // Keeping the hook intact ensures recoverStuckReviews' NOT EXISTS + // guard skips the MR bead while the refinery might still call + // gt_done. Just reset status to idle so processReviewQueue knows + // the refinery is available for re-dispatch if needed. 
query( this.sql, /* sql */ ` diff --git a/cloudflare-gastown/src/dos/town/review-queue.ts b/cloudflare-gastown/src/dos/town/review-queue.ts index 67bafed0d8..8c698a119a 100644 --- a/cloudflare-gastown/src/dos/town/review-queue.ts +++ b/cloudflare-gastown/src/dos/town/review-queue.ts @@ -30,8 +30,9 @@ import type { ReviewQueueInput, ReviewQueueEntry, AgentDoneInput, Molecule } fro // Review entries stuck in 'running' past this timeout are reset to 'pending'. // Only applies when no agent (working or idle) is hooked to the MR bead. -// Set to 15 min to give the refinery ample time for clone + review + merge. -const REVIEW_RUNNING_TIMEOUT_MS = 15 * 60 * 1000; +// Set to 30 min — reviews can legitimately take 10-15 min for clone + build +// + test + merge, and the refinery hook guard is the primary protection. +const REVIEW_RUNNING_TIMEOUT_MS = 30 * 60 * 1000; function generateId(): string { return crypto.randomUUID(); @@ -565,7 +566,11 @@ export function closeOrphanedReviewBeads(sql: SqlStorage): void { * recovery timeout, to avoid interfering with in-flight reviews. */ export function recoverOrphanedSourceBeads(sql: SqlStorage): void { - const cutoff = new Date(Date.now() - REVIEW_RUNNING_TIMEOUT_MS).toISOString(); + // Use a shorter timeout than REVIEW_RUNNING_TIMEOUT_MS — by the time + // this runs, ALL MR beads for the source are already terminal (the + // NOT EXISTS guard below ensures this). The 5-min window just avoids + // interfering with in-flight transitions. + const cutoff = new Date(Date.now() - 5 * 60 * 1000).toISOString(); const stuckRows = [ ...query( @@ -804,29 +809,18 @@ export function agentCompleted( if (!agent) return result; if (agent.current_hook_bead_id) { - // When a refinery exits with 'completed' but the MR bead is still - // in_progress (not closed/merged), it means the refinery requested - // rework. Route through completeReviewWithResult so the source bead - // is returned to in_progress for re-dispatch. 
- if (agent.role === 'refinery' && input.status === 'completed') { - const mrBead = getBead(sql, agent.current_hook_bead_id); - if (mrBead && mrBead.status !== 'closed') { - const sourceBeadId = - typeof mrBead.metadata?.source_bead_id === 'string' - ? mrBead.metadata.source_bead_id - : null; - completeReviewWithResult(sql, { - entry_id: agent.current_hook_bead_id, - status: 'failed', - message: input.reason ?? 'Refinery exited without merge — rework needed', - }); - result.reworkSourceBeadId = sourceBeadId; - unhookBead(sql, agentId); - // Mark agent idle (below) - } else { - // MR was already closed (merged) — normal completion - unhookBead(sql, agentId); - } + if (agent.role === 'refinery') { + // NEVER fail an MR bead from agentCompleted. The refinery's lifecycle + // is managed by gt_done (success) and recoverStuckReviews (timeout). + // + // agentCompleted races with gt_done: the process may exit before + // gt_done's HTTP response reaches the DO, causing agentCompleted to + // arrive first. If we fail the MR here, we'd undo a successful merge. + // + // Just unhook and idle. If the refinery merged, gt_done will close + // the MR bead when it arrives. If the refinery crashed without + // merging, recoverStuckReviews resets the MR bead after timeout. + unhookBead(sql, agentId); } else { const beadStatus = input.status === 'completed' ? 'closed' : 'failed'; updateBeadStatus(sql, agent.current_hook_bead_id, beadStatus, agentId); From c502aaafd609fb6389d095f19b45b18841cbb330 Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Thu, 19 Mar 2026 12:56:39 -0500 Subject: [PATCH 33/47] debug: add unauthenticated /debug/towns/:id/status endpoint and monitoring script MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Temporary debug infrastructure for monitoring town state without JWT auth. 
- GET /debug/towns/:townId/status — returns alarmStatus, agentMeta, beadSummary - scripts/monitor-town.sh — continuous polling script with formatted output REMOVE the debug endpoint before merging to main. --- cloudflare-gastown/scripts/monitor-town.sh | 84 ++++++++++++++++++++++ cloudflare-gastown/src/dos/Town.do.ts | 22 ++++++ cloudflare-gastown/src/gastown.worker.ts | 12 ++++ 3 files changed, 118 insertions(+) create mode 100755 cloudflare-gastown/scripts/monitor-town.sh diff --git a/cloudflare-gastown/scripts/monitor-town.sh b/cloudflare-gastown/scripts/monitor-town.sh new file mode 100755 index 0000000000..8b21095adb --- /dev/null +++ b/cloudflare-gastown/scripts/monitor-town.sh @@ -0,0 +1,84 @@ +#!/bin/bash +# Continuously monitor a town's state via the debug endpoint. +# Usage: ./scripts/monitor-town.sh [townId] [interval_seconds] + +TOWN_ID="${1:-8a6f9375-b806-4ee0-ad6e-1697ea2dbfff}" +INTERVAL="${2:-15}" +BASE_URL="${GASTOWN_URL:-https://gastown.kiloapps.io}" +URL="${BASE_URL}/debug/towns/${TOWN_ID}/status" + +echo "Monitoring town ${TOWN_ID} every ${INTERVAL}s" +echo "Endpoint: ${URL}" +echo "Press Ctrl+C to stop" +echo "==========================================" + +while true; do + RESP=$(curl -s --max-time 10 "${URL}" 2>/dev/null) + if [ -z "$RESP" ]; then + echo "$(date -u +%H:%M:%S) [ERROR] No response from ${URL}" + sleep "$INTERVAL" + continue + fi + + echo "$RESP" | python3 -c " +import sys, json, datetime + +try: + d = json.load(sys.stdin) +except: + print('$(date -u +%H:%M:%S) [ERROR] Invalid JSON response') + sys.exit(0) + +ts = datetime.datetime.utcnow().strftime('%H:%M:%S') +alarm = d.get('alarmStatus', {}) +agents_info = alarm.get('agents', {}) +beads_info = alarm.get('beads', {}) +patrol_info = alarm.get('patrol', {}) +events = alarm.get('recentEvents', []) + +working = agents_info.get('working', 0) +idle = agents_info.get('idle', 0) +op = beads_info.get('open', 0) +ip = beads_info.get('inProgress', 0) +ir = beads_info.get('inReview', 0) 
+failed = beads_info.get('failed', 0) +orphaned = patrol_info.get('orphanedHooks', 0) + +# Agent details +agents = d.get('agentMeta', []) +hooked_agents = [a for a in agents if a.get('current_hook_bead_id')] +refinery = [a for a in agents if a.get('role') == 'refinery'] + +# Non-terminal beads +beads = d.get('beadSummary', []) + +print(f'{ts} W={working} I={idle} | open={op} prog={ip} review={ir} fail={failed} | hooks={orphaned} hooked={len(hooked_agents)}') + +# Show refinery state +for r in refinery: + hook = r.get('current_hook_bead_id', 'NULL') or 'NULL' + print(f' refinery: status={r.get(\"status\",\"?\"):8s} hook={hook[:12]:12s} dispatch={r.get(\"dispatch_attempts\",0)}') + +# Show non-terminal beads +if beads: + for b in beads[:8]: + assignee = str(b.get('assignee_agent_bead_id', '') or '')[:8] + print(f' {b.get(\"status\",\"?\"):12s} {b.get(\"type\",\"?\"):16s} {str(b.get(\"bead_id\",\"\"))[:8]} agent={assignee:8s} {str(b.get(\"title\",\"\"))[:50]}') + if len(beads) > 8: + print(f' ... 
and {len(beads) - 8} more') + +# Show most recent event +if events: + e = events[0] + print(f' last: {e.get(\"time\",\"\")[:19]} {e.get(\"type\",\"\"):20s} {e.get(\"message\",\"\")[:70]}') + +# Show review outcomes +review_events = [e for e in events if e.get('type') == 'review_completed'] +for e in review_events[:2]: + print(f' REVIEW: {e.get(\"time\",\"\")[:19]} {e.get(\"message\",\"\")[:70]}') + +print() +" 2>/dev/null + + sleep "$INTERVAL" +done diff --git a/cloudflare-gastown/src/dos/Town.do.ts b/cloudflare-gastown/src/dos/Town.do.ts index 0b48461ebc..de5b965b16 100644 --- a/cloudflare-gastown/src/dos/Town.do.ts +++ b/cloudflare-gastown/src/dos/Town.do.ts @@ -4044,6 +4044,28 @@ export class TownDO extends DurableObject { }; } + // DEBUG: concise non-terminal bead summary — remove after debugging + async debugBeadSummary(): Promise { + return [ + ...query( + this.sql, + /* sql */ ` + SELECT ${beads.bead_id}, + ${beads.type}, + ${beads.status}, + ${beads.title}, + ${beads.assignee_agent_bead_id}, + ${beads.updated_at} + FROM ${beads} + WHERE ${beads.status} NOT IN ('closed', 'failed') + AND ${beads.type} != 'agent' + ORDER BY ${beads.type}, ${beads.status} + `, + [] + ), + ]; + } + // DEBUG: raw agent_metadata dump — remove after debugging async debugAgentMetadata(): Promise { return [ diff --git a/cloudflare-gastown/src/gastown.worker.ts b/cloudflare-gastown/src/gastown.worker.ts index 8282446cdf..ca8de34285 100644 --- a/cloudflare-gastown/src/gastown.worker.ts +++ b/cloudflare-gastown/src/gastown.worker.ts @@ -193,6 +193,18 @@ app.get('/', c => c.html(dashboardHtml())); app.get('/health', c => c.json({ status: 'ok' })); +// ── DEBUG: unauthenticated town introspection — REMOVE after debugging ── +app.get('/debug/towns/:townId/status', async c => { + const townId = c.req.param('townId'); + const town = getTownDOStub(c.env, townId); + const alarmStatus = await town.getAlarmStatus(); + // eslint-disable-next-line @typescript-eslint/await-thenable -- DO RPC 
returns promise at runtime + const agentMeta = await town.debugAgentMetadata(); + // eslint-disable-next-line @typescript-eslint/await-thenable + const beadSummary = await town.debugBeadSummary(); + return c.json({ alarmStatus, agentMeta, beadSummary }); +}); + // ── Town ID + Auth ────────────────────────────────────────────────────── // All rig routes live under /api/towns/:townId/rigs/:rigId so the townId // is always available from the URL path. From 4b16fee807c05562bd02d3a2f9d42b6bac5cd14d Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Thu, 19 Mar 2026 13:07:39 -0500 Subject: [PATCH 34/47] fix(gastown): don't fail MR beads when refinery start returns false MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit startAgentInContainer can return false due to a timeout race: the container is still setting up the agent (git clone, worktree creation) but the 60s AbortSignal fires. The container eventually finishes and the refinery starts reviewing, but processReviewQueue has already called failReviewWithRework which fails the MR bead and reopens the source bead. Now when the refinery start returns false, just unhook the refinery and leave the MR bead in in_progress. Two outcomes: - Agent actually started → refinery calls gt_done → MR bead closed - Agent truly failed → recoverStuckReviews resets after 30 min This eliminates the last path that prematurely fails MR beads. 
--- cloudflare-gastown/src/dos/Town.do.ts | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/cloudflare-gastown/src/dos/Town.do.ts b/cloudflare-gastown/src/dos/Town.do.ts index de5b965b16..57b4b1e4b5 100644 --- a/cloudflare-gastown/src/dos/Town.do.ts +++ b/cloudflare-gastown/src/dos/Town.do.ts @@ -3336,19 +3336,25 @@ export class TownDO extends DurableObject { }); if (!started) { + // DON'T fail the MR bead — the container may have actually started + // the agent (timeout race: the fetch timed out but the container + // continued setting up the agent). Leave the MR bead in in_progress. + // If the agent truly failed, recoverStuckReviews will reset it to + // open after 30 min. If the agent succeeded, it will call gt_done + // and close the MR bead normally. + // + // Just unhook the refinery so processReviewQueue doesn't try to + // start a second instance on the next tick. agents.unhookBead(this.sql, refineryAgent.id); agents.updateAgentStatus(this.sql, refineryAgent.id, 'idle'); const containerError = dispatch.getLastStartError() ?? 'unknown'; - const errorCtx = `entry=${entry.id} rigId=${rigId} branch=${entry.branch} ` + - `containerError=${containerError}`; - console.error( - `${TOWN_LOG} processReviewQueue: refinery failed to start: ${errorCtx}` + console.warn( + `${TOWN_LOG} processReviewQueue: refinery start returned false for entry=${entry.id} — ` + + `leaving MR bead in_progress for recovery. 
error=${containerError}` ); agents.updateAgentStatusMessage(this.sql, refineryAgent.id, - `[dispatch_failed] ${containerError}`); - - this.failReviewWithRework(entry, 'Refinery container failed to start'); + `[start_uncertain] ${containerError}`); } } From b8966761a0186ab40dd47fdcdc82427c0a858b00 Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Thu, 19 Mar 2026 13:17:31 -0500 Subject: [PATCH 35/47] fix(gastown): fix stale refinery hook deadlock in recoverStuckReviews MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous fix preserved the refinery hook in witnessPatrol (to prevent premature MR failure), but this created a deadlock: the idle refinery stayed hooked, blocking recoverStuckReviews (NOT EXISTS any agent hooked) from ever recovering the MR bead if gt_done never arrived. Three fixes: 1. recoverStuckReviews: change guard from NOT EXISTS (any agent hooked) to NOT EXISTS (working agent hooked). An idle refinery hooked to a stale MR means it died — the review should be recovered after timeout. 2. recoverStuckReviews: after resetting MR bead to open, unhook any idle refinery still pointing at it so it can be reused. 3. witnessPatrol: when the refinery dies and its MR bead is already closed/failed (gt_done completed), unhook immediately as cleanup. --- cloudflare-gastown/src/dos/Town.do.ts | 21 +++-- .../src/dos/town/review-queue.ts | 83 ++++++++++++++----- 2 files changed, 75 insertions(+), 29 deletions(-) diff --git a/cloudflare-gastown/src/dos/Town.do.ts b/cloudflare-gastown/src/dos/Town.do.ts index 57b4b1e4b5..ec2c7c8468 100644 --- a/cloudflare-gastown/src/dos/Town.do.ts +++ b/cloudflare-gastown/src/dos/Town.do.ts @@ -2910,14 +2910,13 @@ export class TownDO extends DurableObject { if (!agent) continue; if (agent.role === 'refinery') { - // NEVER unhook or agentComplete refineries from witnessPatrol. 
- // The refinery's lifecycle is managed entirely by: - // - gt_done (success): closes MR bead + source bead + unhooks - // - recoverStuckReviews (timeout): resets MR bead to open - // Keeping the hook intact ensures recoverStuckReviews' NOT EXISTS - // guard skips the MR bead while the refinery might still call - // gt_done. Just reset status to idle so processReviewQueue knows - // the refinery is available for re-dispatch if needed. + // Set refinery to idle. Keep the hook intact so + // recoverStuckReviews' guard (NOT EXISTS working agent) can + // distinguish between "refinery is actively working" (skip) + // and "refinery died" (recover after timeout). + // + // Exception: if gt_done already closed the MR bead, unhook + // the refinery as cleanup so it can be reused immediately. query( this.sql, /* sql */ ` @@ -2928,6 +2927,12 @@ export class TownDO extends DurableObject { `, [agentId] ); + if (agent.current_hook_bead_id) { + const mrBead = beadOps.getBead(this.sql, agent.current_hook_bead_id); + if (mrBead && (mrBead.status === 'closed' || mrBead.status === 'failed')) { + agents.unhookBead(this.sql, agentId); + } + } } else if (containerInfo.exitReason === 'completed') { // Non-refinery normal exit — route through agentCompleted reviewQueue.agentCompleted(this.sql, agentId, { status: 'completed' }); diff --git a/cloudflare-gastown/src/dos/town/review-queue.ts b/cloudflare-gastown/src/dos/town/review-queue.ts index 8c698a119a..8cea40d7e2 100644 --- a/cloudflare-gastown/src/dos/town/review-queue.ts +++ b/cloudflare-gastown/src/dos/town/review-queue.ts @@ -469,27 +469,68 @@ export function listPendingPRReviews(sql: SqlStorage): MergeRequestBeadRecord[] */ export function recoverStuckReviews(sql: SqlStorage): void { const timeout = new Date(Date.now() - REVIEW_RUNNING_TIMEOUT_MS).toISOString(); - query( - sql, - /* sql */ ` - UPDATE ${beads} - SET ${beads.columns.status} = 'open', - ${beads.columns.updated_at} = ? 
- WHERE ${beads.type} = 'merge_request' - AND ${beads.status} = 'in_progress' - AND ${beads.updated_at} < ? - AND ${beads.bead_id} NOT IN ( - SELECT ${review_metadata.bead_id} - FROM ${review_metadata} - WHERE ${review_metadata.pr_url} IS NOT NULL - ) - AND NOT EXISTS ( - SELECT 1 FROM ${agent_metadata} - WHERE ${agent_metadata.current_hook_bead_id} = ${beads.bead_id} - ) - `, - [now(), timeout] - ); + const timestamp = now(); + + // Find stuck MR beads: in_progress past the timeout, no pr_url, and + // no WORKING agent hooked. An idle agent hooked to the MR means the + // refinery died (witnessPatrol set it to idle) — the review should be + // recovered. Only skip if the agent is actively working. + const stuckMrRows = BeadRecord.pick({ bead_id: true }) + .array() + .parse([ + ...query( + sql, + /* sql */ ` + SELECT ${beads.bead_id} + FROM ${beads} + WHERE ${beads.type} = 'merge_request' + AND ${beads.status} = 'in_progress' + AND ${beads.updated_at} < ? + AND ${beads.bead_id} NOT IN ( + SELECT ${review_metadata.bead_id} + FROM ${review_metadata} + WHERE ${review_metadata.pr_url} IS NOT NULL + ) + AND NOT EXISTS ( + SELECT 1 FROM ${agent_metadata} + WHERE ${agent_metadata.current_hook_bead_id} = ${beads.bead_id} + AND ${agent_metadata.status} = 'working' + ) + `, + [timeout] + ), + ]); + + for (const row of stuckMrRows) { + // Reset MR bead to open for re-processing + query( + sql, + /* sql */ ` + UPDATE ${beads} + SET ${beads.columns.status} = 'open', + ${beads.columns.updated_at} = ? + WHERE ${beads.bead_id} = ? + `, + [timestamp, row.bead_id] + ); + + // Unhook any idle refinery still pointing at this MR bead so it + // can be reused for the next processReviewQueue cycle + query( + sql, + /* sql */ ` + UPDATE ${agent_metadata} + SET ${agent_metadata.columns.current_hook_bead_id} = NULL + WHERE ${agent_metadata.current_hook_bead_id} = ? 
+ AND ${agent_metadata.status} = 'idle' + `, + [row.bead_id] + ); + + console.log( + `[review-queue] recoverStuckReviews: reset MR bead=${row.bead_id} to open, unhooked idle agents` + ); + } } /** From bb2d7c54b276a0dfd71470cf954848f3345bf621 Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Thu, 19 Mar 2026 13:28:44 -0500 Subject: [PATCH 36/47] fix(gastown): don't roll back bead status on dispatch failure for any agent type MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit dispatchAgent was rolling back beads from in_progress to open when startAgentInContainer returned false. But the container may have actually started the agent (timeout race — the fetch timed out but the container continued setup). The agent starts working, but the DO already rolled the bead back to open. Remove the bead rollback for both the !started and exception paths. Leave the bead in in_progress with the agent idle+hooked. Two outcomes: - Agent actually started → works normally → gt_done closes bead - Agent truly failed → rehookOrphanedBeads recovers after 2 min This is the same pattern applied to processReviewQueue in the previous commit, now extended to the general dispatch path. --- cloudflare-gastown/src/dos/town/scheduling.ts | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/cloudflare-gastown/src/dos/town/scheduling.ts b/cloudflare-gastown/src/dos/town/scheduling.ts index 348d9915bf..3f631dab65 100644 --- a/cloudflare-gastown/src/dos/town/scheduling.ts +++ b/cloudflare-gastown/src/dos/town/scheduling.ts @@ -153,13 +153,11 @@ export async function dispatchAgent( role: agent.role, }); } else { - // Container failed — roll back agent to idle, bead to open. - // Use bead.bead_id (the actual bead being dispatched) rather than - // agent.current_hook_bead_id which may be stale if the agent - // snapshot was taken before hookBead was called. 
- // Set last_activity_at to now() so the dispatch cooldown prevents - // immediate retry — the container may need time to start up after - // a deploy/eviction. + // Container start returned false — but the container may have + // actually started the agent (timeout race). DON'T roll back + // the bead to open. Leave it in_progress with the agent idle+hooked. + // If the agent truly failed: rehookOrphanedBeads recovers after 2 min. + // If the agent actually started: it works and calls gt_done normally. query( ctx.sql, /* sql */ ` @@ -170,7 +168,6 @@ export async function dispatchAgent( `, [now(), agent.id] ); - beadOps.updateBeadStatus(ctx.sql, bead.bead_id, 'open', agent.id); ctx.emitEvent({ event: 'agent.dispatch_failed', townId: ctx.townId, @@ -195,7 +192,7 @@ export async function dispatchAgent( `, [now(), agent.id] ); - beadOps.updateBeadStatus(ctx.sql, bead.bead_id, 'open', agent.id); + // Don't roll back bead to open — same timeout race rationale } catch (rollbackErr) { console.error(`${LOG} dispatchAgent: rollback also failed:`, rollbackErr); } From fb3f9209ed5a3a3bddb554de853673b4ff027618 Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Thu, 19 Mar 2026 13:40:56 -0500 Subject: [PATCH 37/47] fix(gastown): eliminate all fire-and-forget rework dispatch races MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Comprehensive fix addressing the remaining review agent findings: 1. agentCompleted: don't unhook refinery. Leave hook intact so recoverStuckReviews' guard works and gt_done can arrive after agentCompleted without a race window. 2. witnessPatrol: only act on 'exited' for refineries, skip 'not_found'. not_found is ambiguous (container restarting, status check timeout) and falsely setting the refinery to idle enables premature recovery. 3. completeReviewWithResult failure/conflict path: set source bead to 'open' (not in_progress) and clear assignee. 
This lets the normal scheduling path (feedStrandedConvoys → hookBead → schedulePendingWork) handle rework instead of fire-and-forget dispatch that races with patrol recovery. 4. Remove all fire-and-forget rework dispatch code from failReviewWithRework, TownDO.completeReviewWithResult, and TownDO.agentCompleted. Rework is now exclusively handled by the alarm loop's scheduling/patrol functions, eliminating the race between dispatch and recovery. --- cloudflare-gastown/src/dos/Town.do.ts | 94 +++++-------------- .../src/dos/town/review-queue.ts | 56 +++++++---- 2 files changed, 59 insertions(+), 91 deletions(-) diff --git a/cloudflare-gastown/src/dos/Town.do.ts b/cloudflare-gastown/src/dos/Town.do.ts index ec2c7c8468..bec1589935 100644 --- a/cloudflare-gastown/src/dos/Town.do.ts +++ b/cloudflare-gastown/src/dos/Town.do.ts @@ -1242,35 +1242,10 @@ export class TownDO extends DurableObject { }); } - // When a review fails or conflicts (rework), the source bead was - // returned to in_progress. Re-hook a polecat and re-dispatch so the - // rework starts automatically. The original polecat may already be - // working on something else, so fall back to getOrCreateAgent. 
- if ((input.status === 'failed' || input.status === 'conflict') && sourceBeadId) { - const sourceBead = beadOps.getBead(this.sql, sourceBeadId); - if (sourceBead?.rig_id) { - try { - const reworkAgent = agents.getOrCreateAgent( - this.sql, - 'polecat', - sourceBead.rig_id, - this.townId - ); - agents.hookBead(this.sql, reworkAgent.id, sourceBeadId); - this.dispatchAgent(reworkAgent, sourceBead).catch(err => - console.error( - `${TOWN_LOG} completeReviewWithResult: fire-and-forget rework dispatch failed for bead=${sourceBeadId}`, - err - ) - ); - } catch (err) { - console.warn( - `${TOWN_LOG} completeReviewWithResult: could not dispatch rework for bead=${sourceBeadId}:`, - err - ); - } - } - } + // Rework is handled by the normal scheduling path: the failed/conflict + // path in completeReviewWithResult sets the source bead to 'open' with + // assignee cleared. feedStrandedConvoys or rehookOrphanedBeads will + // hook a polecat, and schedulePendingWork will dispatch it. } async agentDone(agentId: string, input: AgentDoneInput): Promise { @@ -2910,13 +2885,19 @@ export class TownDO extends DurableObject { if (!agent) continue; if (agent.role === 'refinery') { - // Set refinery to idle. Keep the hook intact so - // recoverStuckReviews' guard (NOT EXISTS working agent) can - // distinguish between "refinery is actively working" (skip) - // and "refinery died" (recover after timeout). - // + // For refineries, only act on definitive 'exited' status. + // 'not_found' is ambiguous — the container may be restarting + // or the status check may have timed out. Setting the refinery + // to idle on not_found would enable recoverStuckReviews to + // fire prematurely. + if (containerInfo.status === 'not_found') { + // Skip — don't touch the refinery. It may still be alive. + continue; + } + // Container confirmed exited. Set to idle, keep hook intact + // so recoverStuckReviews' guard works (checks status='working'). 
// Exception: if gt_done already closed the MR bead, unhook - // the refinery as cleanup so it can be reused immediately. + // as cleanup so the refinery can be reused immediately. query( this.sql, /* sql */ ` @@ -3364,14 +3345,11 @@ export class TownDO extends DurableObject { } /** - * Fail an MR bead via the full review lifecycle (completeReviewWithResult) - * so that convoy progress is updated and the source bead is returned to - * in_progress for rework. Mirrors the rework dispatch in - * completeReviewWithResult and agentCompleted. - * - * Used by processReviewQueue failure paths that previously called - * completeReview directly — which bypassed convoy progress and left the - * source bead stuck in in_review. + * Fail an MR bead via completeReviewWithResult. The source bead is + * returned to 'open' with its assignee cleared, so the normal + * scheduling path (feedStrandedConvoys → hookBead → schedulePendingWork) + * handles rework. No fire-and-forget dispatch — that pattern was prone + * to races with patrol recovery functions. */ private failReviewWithRework(entry: ReviewQueueEntry, reason: string): void { reviewQueue.completeReviewWithResult(this.sql, { @@ -3385,36 +3363,6 @@ export class TownDO extends DurableObject { townId: this.townId, beadId: entry.id, }); - - // The source bead was returned to in_progress by completeReviewWithResult. - // Attempt to dispatch a polecat for rework (same pattern as the public - // completeReviewWithResult method). 
- const sourceBeadId = entry.bead_id; - if (sourceBeadId && sourceBeadId !== entry.id) { - const sourceBead = beadOps.getBead(this.sql, sourceBeadId); - if (sourceBead?.rig_id) { - try { - const reworkAgent = agents.getOrCreateAgent( - this.sql, - 'polecat', - sourceBead.rig_id, - this.townId - ); - agents.hookBead(this.sql, reworkAgent.id, sourceBeadId); - this.dispatchAgent(reworkAgent, sourceBead).catch(err => - console.error( - `${TOWN_LOG} failReviewWithRework: rework dispatch failed for bead=${sourceBeadId}`, - err - ) - ); - } catch (err) { - console.warn( - `${TOWN_LOG} failReviewWithRework: could not dispatch rework for bead=${sourceBeadId}:`, - err - ); - } - } - } } /** diff --git a/cloudflare-gastown/src/dos/town/review-queue.ts b/cloudflare-gastown/src/dos/town/review-queue.ts index 8cea40d7e2..8fe5f31512 100644 --- a/cloudflare-gastown/src/dos/town/review-queue.ts +++ b/cloudflare-gastown/src/dos/town/review-queue.ts @@ -370,21 +370,40 @@ export function completeReviewWithResult( conflict: true, }, }); - // Return source bead to in_progress so the polecat can be re-dispatched - // to resolve the conflict (in_review → in_progress rework flow). - // Skip if source bead already reached a terminal state. + // Return source bead to open so the normal scheduling path handles + // rework. Clear assignee so feedStrandedConvoys can match. const conflictSourceBead = getBead(sql, entry.bead_id); if (conflictSourceBead && conflictSourceBead.status !== 'closed' && conflictSourceBead.status !== 'failed') { - updateBeadStatus(sql, entry.bead_id, 'in_progress', entry.agent_id); + updateBeadStatus(sql, entry.bead_id, 'open', entry.agent_id); + query( + sql, + /* sql */ ` + UPDATE ${beads} + SET ${beads.columns.assignee_agent_bead_id} = NULL + WHERE ${beads.bead_id} = ? 
+ `, + [entry.bead_id] + ); } } else if (input.status === 'failed') { - // Review failed (rework requested): return source bead to in_progress - // so it can be re-dispatched (in_review → in_progress rework flow). - // BUT only if the source bead hasn't already reached a terminal state - // (e.g. closed by a different MR bead that merged successfully). + // Review failed (rework requested): return source bead to open so + // the normal scheduling path (feedStrandedConvoys → hookBead → + // schedulePendingWork → dispatch) handles rework. Clear the stale + // assignee so feedStrandedConvoys can match (requires assignee IS NULL). + // This avoids the fire-and-forget rework dispatch race in TownDO + // where the dispatch fails and rehookOrphanedBeads churn. const sourceBead = getBead(sql, entry.bead_id); if (sourceBead && sourceBead.status !== 'closed' && sourceBead.status !== 'failed') { - updateBeadStatus(sql, entry.bead_id, 'in_progress', entry.agent_id); + updateBeadStatus(sql, entry.bead_id, 'open', entry.agent_id); + query( + sql, + /* sql */ ` + UPDATE ${beads} + SET ${beads.columns.assignee_agent_bead_id} = NULL + WHERE ${beads.bead_id} = ? + `, + [entry.bead_id] + ); } } } @@ -851,17 +870,18 @@ export function agentCompleted( if (agent.current_hook_bead_id) { if (agent.role === 'refinery') { - // NEVER fail an MR bead from agentCompleted. The refinery's lifecycle - // is managed by gt_done (success) and recoverStuckReviews (timeout). + // NEVER fail or unhook a refinery from agentCompleted. + // agentCompleted races with gt_done: the process exits, the + // container sends /completed, but gt_done's HTTP request may + // still be in flight. If we unhook here, recoverStuckReviews + // can fire between agentCompleted and gt_done, resetting the + // MR bead that's about to be closed by gt_done. // - // agentCompleted races with gt_done: the process may exit before - // gt_done's HTTP response reaches the DO, causing agentCompleted to - // arrive first. 
If we fail the MR here, we'd undo a successful merge. + // Leave the hook intact. gt_done will close + unhook if the + // merge succeeded. recoverStuckReviews (which checks for + // status='working') handles the case where gt_done never arrives. // - // Just unhook and idle. If the refinery merged, gt_done will close - // the MR bead when it arrives. If the refinery crashed without - // merging, recoverStuckReviews resets the MR bead after timeout. - unhookBead(sql, agentId); + // No-op for the bead — just fall through to mark agent idle. } else { const beadStatus = input.status === 'completed' ? 'closed' : 'failed'; updateBeadStatus(sql, agent.current_hook_bead_id, beadStatus, agentId); From 807efb4a1dac036276c311118c29556fdf426311 Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Thu, 19 Mar 2026 14:17:04 -0500 Subject: [PATCH 38/47] fix(gastown): skip not_found for ALL agents in witnessPatrol + add merge path logging MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit witnessPatrol was setting non-refinery agents to idle on transient not_found container status, causing working agents to appear idle. Extend the not_found skip to all agent types — only act on confirmed exited status. Add diagnostic logging to completeReviewWithResult merged path to trace why source beads are not being closed after successful merges. 
--- cloudflare-gastown/src/dos/Town.do.ts | 25 ++++++------------- .../src/dos/town/review-queue.ts | 5 ++++ 2 files changed, 13 insertions(+), 17 deletions(-) diff --git a/cloudflare-gastown/src/dos/Town.do.ts b/cloudflare-gastown/src/dos/Town.do.ts index bec1589935..ddf703e57e 100644 --- a/cloudflare-gastown/src/dos/Town.do.ts +++ b/cloudflare-gastown/src/dos/Town.do.ts @@ -2881,23 +2881,18 @@ export class TownDO extends DurableObject { const { agentId, containerInfo } = result.value; if (containerInfo.status === 'not_found' || containerInfo.status === 'exited') { + // 'not_found' is ambiguous for ALL agent types — the container + // may be restarting, the status check may have timed out, or + // the process manager hasn't registered the agent yet. Only act + // on confirmed 'exited' status. + if (containerInfo.status === 'not_found') continue; + const agent = agents.getAgent(this.sql, agentId); if (!agent) continue; if (agent.role === 'refinery') { - // For refineries, only act on definitive 'exited' status. - // 'not_found' is ambiguous — the container may be restarting - // or the status check may have timed out. Setting the refinery - // to idle on not_found would enable recoverStuckReviews to - // fire prematurely. - if (containerInfo.status === 'not_found') { - // Skip — don't touch the refinery. It may still be alive. - continue; - } - // Container confirmed exited. Set to idle, keep hook intact - // so recoverStuckReviews' guard works (checks status='working'). - // Exception: if gt_done already closed the MR bead, unhook - // as cleanup so the refinery can be reused immediately. + // Set to idle, keep hook intact for recoverStuckReviews guard. + // If gt_done already closed the MR bead, unhook as cleanup. 
query( this.sql, /* sql */ ` @@ -2930,10 +2925,6 @@ export class TownDO extends DurableObject { `, [agentId] ); - console.log( - `${TOWN_LOG} witnessPatrol: agent ${agentId} (${agent.role}) died abnormally — ` + - `reset to idle, bead status preserved for recovery` - ); } } } diff --git a/cloudflare-gastown/src/dos/town/review-queue.ts b/cloudflare-gastown/src/dos/town/review-queue.ts index 8fe5f31512..d515dd3ced 100644 --- a/cloudflare-gastown/src/dos/town/review-queue.ts +++ b/cloudflare-gastown/src/dos/town/review-queue.ts @@ -310,6 +310,11 @@ export function completeReviewWithResult( if (input.status === 'merged') { const mergeTimestamp = now(); + console.log( + `[review-queue] completeReviewWithResult MERGED: entry_id=${input.entry_id} ` + + `entry.bead_id (source)=${entry.bead_id} entry.id (MR)=${entry.id} — ` + + `calling closeBead on source` + ); closeBead(sql, entry.bead_id, entry.agent_id); // Close ALL other open/in_progress/failed MR beads for the same From c8832f51425c01853d5de645b096b237116a9323 Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Thu, 19 Mar 2026 14:33:49 -0500 Subject: [PATCH 39/47] fix(gastown): unhook stale refinery before re-hooking + fast recovery for abandoned MR beads Two fixes for MR beads stuck in in_progress: 1. processReviewQueue: unhook the refinery from its previous MR bead before hooking to the new one. agentCompleted preserves the refinery hook, so when processReviewQueue runs next, hookBead throws 'already hooked'. This crashed processReviewQueue silently (caught by Promise.allSettled), leaving the MR bead stuck in in_progress with no agent. 2. recoverStuckReviews: add fast 2-min recovery for MR beads that are in_progress with NO agent hooked at all. These are clearly abandoned (hookBead threw or the refinery was unhooked by another path). The 30-min timeout remains for MR beads with an idle refinery hooked (waiting for gt_done to arrive). 
--- cloudflare-gastown/src/dos/Town.do.ts | 5 +++ .../src/dos/town/review-queue.ts | 40 +++++++++++++++++++ 2 files changed, 45 insertions(+) diff --git a/cloudflare-gastown/src/dos/Town.do.ts b/cloudflare-gastown/src/dos/Town.do.ts index ddf703e57e..3a7622d83c 100644 --- a/cloudflare-gastown/src/dos/Town.do.ts +++ b/cloudflare-gastown/src/dos/Town.do.ts @@ -3283,6 +3283,11 @@ export class TownDO extends DurableObject { // Hook the refinery to the MR bead (entry.id), not the source bead // (entry.bead_id). The source bead stays closed with its original // polecat assignee preserved. + // If the refinery is still hooked to a previous MR bead (agentCompleted + // preserves hooks for refineries), unhook first. + if (refineryAgent.current_hook_bead_id && refineryAgent.current_hook_bead_id !== entry.id) { + agents.unhookBead(this.sql, refineryAgent.id); + } agents.hookBead(this.sql, refineryAgent.id, entry.id); // Mark as working before the async container start (same I/O gate diff --git a/cloudflare-gastown/src/dos/town/review-queue.ts b/cloudflare-gastown/src/dos/town/review-queue.ts index d515dd3ced..049e00a000 100644 --- a/cloudflare-gastown/src/dos/town/review-queue.ts +++ b/cloudflare-gastown/src/dos/town/review-queue.ts @@ -555,6 +555,46 @@ export function recoverStuckReviews(sql: SqlStorage): void { `[review-queue] recoverStuckReviews: reset MR bead=${row.bead_id} to open, unhooked idle agents` ); } + + // Fast recovery: MR beads in_progress with NO agent hooked at all are + // clearly abandoned (hookBead threw or the refinery was unhooked). + // Recover after 2 min instead of waiting for the 30-min timeout. + const abandonedCutoff = new Date(Date.now() - 2 * 60 * 1000).toISOString(); + const abandonedRows = BeadRecord.pick({ bead_id: true }) + .array() + .parse([ + ...query( + sql, + /* sql */ ` + SELECT ${beads.bead_id} + FROM ${beads} + WHERE ${beads.type} = 'merge_request' + AND ${beads.status} = 'in_progress' + AND ${beads.updated_at} < ? 
+ AND NOT EXISTS ( + SELECT 1 FROM ${agent_metadata} + WHERE ${agent_metadata.current_hook_bead_id} = ${beads.bead_id} + ) + `, + [abandonedCutoff] + ), + ]); + + for (const row of abandonedRows) { + query( + sql, + /* sql */ ` + UPDATE ${beads} + SET ${beads.columns.status} = 'open', + ${beads.columns.updated_at} = ? + WHERE ${beads.bead_id} = ? + `, + [timestamp, row.bead_id] + ); + console.log( + `[review-queue] recoverStuckReviews: fast-recovered abandoned MR bead=${row.bead_id} (no agent hooked)` + ); + } } /** From ad150c05342c72bfade6a886eb5bedca95459efc Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Thu, 19 Mar 2026 14:45:59 -0500 Subject: [PATCH 40/47] fix(gastown): set refinery to idle on not_found (don't skip entirely) Skipping not_found for refineries caused them to stay 'working' in the DB permanently when the container process died. witnessPatrol never reset them, and recoverStuckReviews' guard (no WORKING agent) meant 30-min recovery also didn't fire. Now set the refinery to idle on both not_found and exited (same as before but without skipping). Keep the hook intact so gt_done can still arrive and close the MR. If the refinery is idle+hooked: - processReviewQueue unhooks it before hooking to the new MR (prev fix) - recoverStuckReviews recovers after 30 min (guard: no working agent) - 2-min fast recovery handles MR beads left with no hook at all Non-refineries: reset to idle on both not_found and exited (unchanged). 
--- cloudflare-gastown/src/dos/Town.do.ts | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/cloudflare-gastown/src/dos/Town.do.ts b/cloudflare-gastown/src/dos/Town.do.ts index 3a7622d83c..0c6f784c96 100644 --- a/cloudflare-gastown/src/dos/Town.do.ts +++ b/cloudflare-gastown/src/dos/Town.do.ts @@ -2881,18 +2881,15 @@ export class TownDO extends DurableObject { const { agentId, containerInfo } = result.value; if (containerInfo.status === 'not_found' || containerInfo.status === 'exited') { - // 'not_found' is ambiguous for ALL agent types — the container - // may be restarting, the status check may have timed out, or - // the process manager hasn't registered the agent yet. Only act - // on confirmed 'exited' status. - if (containerInfo.status === 'not_found') continue; - const agent = agents.getAgent(this.sql, agentId); if (!agent) continue; if (agent.role === 'refinery') { - // Set to idle, keep hook intact for recoverStuckReviews guard. - // If gt_done already closed the MR bead, unhook as cleanup. + // For refineries: set to idle, keep hook intact. + // - gt_done may still arrive and close the MR bead normally + // - recoverStuckReviews (guard: no WORKING agent) will recover + // after 30 min if gt_done never arrives + // - If MR bead is already closed (gt_done already ran), unhook query( this.sql, /* sql */ ` From 6db57226ebb42190cf42f635d6e004620ba63e75 Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Thu, 19 Mar 2026 14:51:35 -0500 Subject: [PATCH 41/47] fix(gastown): add refinery dispatch retry in processReviewQueue After a container restart, the first startAgentInContainer call may fail (container not ready). The refinery is left idle+hooked to the MR bead (in_progress). Without retries, the MR sits in in_progress for 30 min until recoverStuckReviews resets it. Now processReviewQueue checks for this state at the start of each tick: if the refinery is idle+hooked to an in_progress MR bead, re-dispatch it via dispatchAgent. 
This retries every 5 seconds (alarm interval) until the container accepts the start. --- cloudflare-gastown/src/dos/Town.do.ts | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/cloudflare-gastown/src/dos/Town.do.ts b/cloudflare-gastown/src/dos/Town.do.ts index 0c6f784c96..4bafcb0981 100644 --- a/cloudflare-gastown/src/dos/Town.do.ts +++ b/cloudflare-gastown/src/dos/Town.do.ts @@ -3182,6 +3182,24 @@ export class TownDO extends DurableObject { // Poll open PRs created by the 'pr' strategy await this.pollPendingPRs(); + // Retry: if the refinery is idle but still hooked to an in_progress + // MR bead, the previous dispatch failed (container not ready). Re- + // dispatch via dispatchAgent which handles the full startup flow. + const refineryForRetry = agents.listAgents(this.sql, { role: 'refinery' })[0]; + if ( + refineryForRetry?.status === 'idle' && + refineryForRetry.current_hook_bead_id + ) { + const hookedMr = beadOps.getBead(this.sql, refineryForRetry.current_hook_bead_id); + if (hookedMr?.status === 'in_progress' && hookedMr.type === 'merge_request') { + console.log( + `${TOWN_LOG} processReviewQueue: retrying refinery dispatch for MR bead=${hookedMr.bead_id}` + ); + await this.dispatchAgent(refineryForRetry, hookedMr); + return; + } + } + const entry = reviewQueue.popReviewQueue(this.sql); if (!entry) return; From aaf31d52ec4043e0288de8ef88528f2ad7fa71da Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Thu, 19 Mar 2026 15:31:06 -0500 Subject: [PATCH 42/47] fix(gastown): keep refinery hook on start failure + block popping when any MR in-progress MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three fixes for the cascading MR bead problem: 1. processReviewQueue start failure: keep the refinery hook intact instead of unhooking. The retry block at the top of processReviewQueue handles idle+hooked refineries. 
By preserving the hook, we prevent popping new MR beads while the container is unavailable — the retry block retries the same MR each tick. 2. popReviewQueue: don't pop ANY open MR if there's already an in_progress MR for the same rig. Since there's only one refinery per rig, popping a second MR while the first is in_progress is always wrong. 3. Clear refinery checkpoint when unhooking from a previous MR bead to prevent stale conversation context leaking into new reviews. --- cloudflare-gastown/src/dos/Town.do.ts | 18 +++++++----------- .../src/dos/town/review-queue.ts | 16 ++++------------ 2 files changed, 11 insertions(+), 23 deletions(-) diff --git a/cloudflare-gastown/src/dos/Town.do.ts b/cloudflare-gastown/src/dos/Town.do.ts index 4bafcb0981..24ef7dae16 100644 --- a/cloudflare-gastown/src/dos/Town.do.ts +++ b/cloudflare-gastown/src/dos/Town.do.ts @@ -3299,9 +3299,10 @@ export class TownDO extends DurableObject { // (entry.bead_id). The source bead stays closed with its original // polecat assignee preserved. // If the refinery is still hooked to a previous MR bead (agentCompleted - // preserves hooks for refineries), unhook first. + // preserves hooks for refineries), unhook first and clear stale checkpoint. if (refineryAgent.current_hook_bead_id && refineryAgent.current_hook_bead_id !== entry.id) { agents.unhookBead(this.sql, refineryAgent.id); + agents.writeCheckpoint(this.sql, refineryAgent.id, null); } agents.hookBead(this.sql, refineryAgent.id, entry.id); @@ -3333,16 +3334,11 @@ export class TownDO extends DurableObject { }); if (!started) { - // DON'T fail the MR bead — the container may have actually started - // the agent (timeout race: the fetch timed out but the container - // continued setting up the agent). Leave the MR bead in in_progress. - // If the agent truly failed, recoverStuckReviews will reset it to - // open after 30 min. If the agent succeeded, it will call gt_done - // and close the MR bead normally. 
- // - // Just unhook the refinery so processReviewQueue doesn't try to - // start a second instance on the next tick. - agents.unhookBead(this.sql, refineryAgent.id); + // Keep hook intact — the retry block at the top of processReviewQueue + // will re-dispatch on the next alarm tick. By preserving the hook, + // we prevent the cascade of popping new MRs when the container is + // temporarily unavailable. recoverStuckReviews clears after 30 min + // if retries never succeed. agents.updateAgentStatus(this.sql, refineryAgent.id, 'idle'); const containerError = dispatch.getLastStartError() ?? 'unknown'; diff --git a/cloudflare-gastown/src/dos/town/review-queue.ts b/cloudflare-gastown/src/dos/town/review-queue.ts index 049e00a000..f9e9590935 100644 --- a/cloudflare-gastown/src/dos/town/review-queue.ts +++ b/cloudflare-gastown/src/dos/town/review-queue.ts @@ -204,18 +204,10 @@ export function popReviewQueue(sql: SqlStorage): ReviewQueueEntry | null { ${REVIEW_JOIN} WHERE ${beads.status} = 'open' AND NOT EXISTS ( - SELECT 1 - FROM ${bead_dependencies} AS my_dep - INNER JOIN ${bead_dependencies} AS sib_dep - ON sib_dep.${bead_dependencies.columns.depends_on_bead_id} = my_dep.${bead_dependencies.columns.depends_on_bead_id} - AND sib_dep.${bead_dependencies.columns.dependency_type} = 'tracks' - INNER JOIN ${beads} AS sibling - ON sibling.${beads.columns.bead_id} = sib_dep.${bead_dependencies.columns.bead_id} - WHERE my_dep.${bead_dependencies.columns.bead_id} = ${beads.bead_id} - AND my_dep.${bead_dependencies.columns.dependency_type} = 'tracks' - AND sibling.${beads.columns.type} = 'merge_request' - AND sibling.${beads.columns.status} = 'in_progress' - AND sibling.${beads.columns.bead_id} != ${beads.bead_id} + SELECT 1 FROM ${beads} AS active_mr + WHERE active_mr.${beads.columns.type} = 'merge_request' + AND active_mr.${beads.columns.status} = 'in_progress' + AND active_mr.${beads.columns.rig_id} = ${beads.rig_id} ) ORDER BY ${beads.created_at} ASC LIMIT 1 From 
016d6cc51474edd0e23e11061eec3c320a855a2b Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Thu, 19 Mar 2026 15:38:56 -0500 Subject: [PATCH 43/47] fix(gastown): treat 'already running' container response as successful start When startAgentInContainer gets a 500 with 'already running', the agent process IS alive from a previous dispatch attempt. The previous dispatch returned false (timeout or transient error) but the container actually started the agent. Subsequent retry attempts get 'already running' and return false, causing the DO to keep the refinery idle while the agent is actually running. Detect 'already running' in the error response and return true so the DO correctly marks the agent as working. --- cloudflare-gastown/src/dos/town/container-dispatch.ts | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/cloudflare-gastown/src/dos/town/container-dispatch.ts b/cloudflare-gastown/src/dos/town/container-dispatch.ts index 96acc42330..4ada979758 100644 --- a/cloudflare-gastown/src/dos/town/container-dispatch.ts +++ b/cloudflare-gastown/src/dos/town/container-dispatch.ts @@ -437,12 +437,20 @@ export async function startAgentInContainer( if (!response.ok) { const text = await response.text().catch(() => '(unreadable)'); + // "Already running" means a previous dispatch succeeded — the agent + // IS alive in the container. Treat as success so the DO marks the + // agent as working and stops retrying. 
+ if (response.status === 500 && text.includes('already running')) { + console.log( + `${TOWN_LOG} startAgentInContainer: agent ${params.agentId} already running — treating as success` + ); + return true; + } const errorMsg = `(${response.status}) ${text.slice(0, 300)}`; console.error( `${TOWN_LOG} startAgentInContainer: error response for ` + `agent=${params.agentId} role=${params.role}: ${errorMsg}` ); - // Store error on a well-known key so the caller can read it lastStartError = errorMsg; } return response.ok; From 52b446d8b7da7051541830325799d3e708531a95 Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Thu, 19 Mar 2026 15:57:18 -0500 Subject: [PATCH 44/47] fix(gastown): check container status before retrying refinery dispatch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The retry block was calling dispatchAgent without checking if the agent is already running in the container. When the original dispatch succeeded (but we got a false negative from timeout), the retry sends another /agents/start which gets 'already running'. While startAgentInContainer now handles this, it's cleaner to check first. Now: check container status → if running, restore to 'working' (no dispatch needed). If not running, dispatch normally. --- cloudflare-gastown/src/dos/Town.do.ts | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/cloudflare-gastown/src/dos/Town.do.ts b/cloudflare-gastown/src/dos/Town.do.ts index 24ef7dae16..4ffc95dde1 100644 --- a/cloudflare-gastown/src/dos/Town.do.ts +++ b/cloudflare-gastown/src/dos/Town.do.ts @@ -3183,8 +3183,10 @@ export class TownDO extends DurableObject { await this.pollPendingPRs(); // Retry: if the refinery is idle but still hooked to an in_progress - // MR bead, the previous dispatch failed (container not ready). Re- - // dispatch via dispatchAgent which handles the full startup flow. 
+ // MR bead, either the previous dispatch failed (container not ready) + // or the dispatch succeeded but we got a false negative (timeout). + // Check the container first — if the agent is running, just restore + // the working status. If not, re-dispatch. const refineryForRetry = agents.listAgents(this.sql, { role: 'refinery' })[0]; if ( refineryForRetry?.status === 'idle' && @@ -3192,6 +3194,15 @@ export class TownDO extends DurableObject { ) { const hookedMr = beadOps.getBead(this.sql, refineryForRetry.current_hook_bead_id); if (hookedMr?.status === 'in_progress' && hookedMr.type === 'merge_request') { + const containerStatus = await dispatch.checkAgentContainerStatus( + this.env, this.townId, refineryForRetry.id + ); + if (containerStatus.status === 'running') { + // Agent IS running — restore working status + agents.updateAgentStatus(this.sql, refineryForRetry.id, 'working'); + return; + } + // Agent is NOT running — genuinely needs re-dispatch console.log( `${TOWN_LOG} processReviewQueue: retrying refinery dispatch for MR bead=${hookedMr.bead_id}` ); From ed685364ffd8ad7bcab8b83c12e16e96fc1d6b8f Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Thu, 19 Mar 2026 16:20:48 -0500 Subject: [PATCH 45/47] fix(gastown): fix PR-strategy MR beads stuck after external merge MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three fixes for PR-strategy review beads: 1. recoverStuckReviews fast-recovery: exclude MR beads with pr_url. The fast 2-min recovery was resetting PR-strategy beads to open, making them invisible to pollPendingPRs (which only queries in_progress beads). This caused an infinite cycle: pop → dispatch → refinery sees existing PR → gt_done with pr_url → unhook → 2min → fast recovery resets to open → pop again. 2. pollPendingPRs: use this.completeReviewWithResult (the TownDO wrapper) instead of reviewQueue.completeReviewWithResult directly. 
The wrapper emits events and calls dispatchUnblockedBeads, which is critical for unblocking downstream convoy beads after a PR merge. 3. closeOrphanedReviewBeads: use completeReviewWithResult('failed') instead of closeBead. closeBead only closes the MR bead without transitioning the source bead, leaving it stuck in in_review. --- cloudflare-gastown/src/dos/Town.do.ts | 6 ++++-- .../src/dos/town/review-queue.ts | 18 +++++++++++++++--- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/cloudflare-gastown/src/dos/Town.do.ts b/cloudflare-gastown/src/dos/Town.do.ts index 4ffc95dde1..f223d8f8df 100644 --- a/cloudflare-gastown/src/dos/Town.do.ts +++ b/cloudflare-gastown/src/dos/Town.do.ts @@ -3568,14 +3568,16 @@ export class TownDO extends DurableObject { if (!status) continue; if (status === 'merged') { - reviewQueue.completeReviewWithResult(this.sql, { + // Use the TownDO wrapper (not the module function directly) + // so dispatchUnblockedBeads fires and events are emitted. + await this.completeReviewWithResult({ entry_id: review.bead_id, status: 'merged', message: 'PR merged externally', }); console.log(`${TOWN_LOG} pollPendingPRs: PR merged for entry=${review.bead_id}`); } else if (status === 'closed') { - reviewQueue.completeReviewWithResult(this.sql, { + await this.completeReviewWithResult({ entry_id: review.bead_id, status: 'failed', message: 'PR closed without merge', diff --git a/cloudflare-gastown/src/dos/town/review-queue.ts b/cloudflare-gastown/src/dos/town/review-queue.ts index f9e9590935..979028b020 100644 --- a/cloudflare-gastown/src/dos/town/review-queue.ts +++ b/cloudflare-gastown/src/dos/town/review-queue.ts @@ -558,7 +558,7 @@ export function recoverStuckReviews(sql: SqlStorage): void { ...query( sql, /* sql */ ` - SELECT ${beads.bead_id} + SELECT ${beads.bead_id} FROM ${beads} WHERE ${beads.type} = 'merge_request' AND ${beads.status} = 'in_progress' @@ -567,6 +567,11 @@ export function recoverStuckReviews(sql: SqlStorage): void { SELECT 1 
FROM ${agent_metadata} WHERE ${agent_metadata.current_hook_bead_id} = ${beads.bead_id} ) + AND ${beads.bead_id} NOT IN ( + SELECT ${review_metadata.bead_id} + FROM ${review_metadata} + WHERE ${review_metadata.pr_url} IS NOT NULL + ) `, [abandonedCutoff] ), @@ -635,9 +640,16 @@ export function closeOrphanedReviewBeads(sql: SqlStorage): void { .object({ bead_id: z.string(), assignee_agent_bead_id: z.string().nullable() }) .parse(row); try { - closeBead(sql, parsed.bead_id, parsed.assignee_agent_bead_id ?? 'system'); + // Use completeReviewWithResult instead of closeBead so the source + // bead is also transitioned (closeBead only closes the MR bead + // itself, leaving the source stuck in in_review). + completeReviewWithResult(sql, { + entry_id: parsed.bead_id, + status: 'failed', + message: 'PR review orphaned — agent died and polling could not resolve', + }); console.log( - `[review-queue] closeOrphanedReviewBeads: closed orphaned MR bead=${parsed.bead_id}` + `[review-queue] closeOrphanedReviewBeads: failed orphaned MR bead=${parsed.bead_id}` ); } catch (err) { console.warn( From 8b62a7e6f4f6fc4d8db85d902938f06ff649741e Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Thu, 19 Mar 2026 16:31:01 -0500 Subject: [PATCH 46/47] fix(gastown): unhook refinery from terminal MR beads at start of processReviewQueue When gt_done or pollPendingPRs closes an MR bead while the refinery is still running in the container, the refinery stays 'working' and hooked to the closed MR. processReviewQueue sees the refinery as non-idle and re-queues new MR beads instead of dispatching them. Add cleanup at the top of processReviewQueue: if the refinery is hooked to a closed/failed MR bead, unhook it, set to idle, and clear the checkpoint. This frees the refinery for new work immediately. 
--- cloudflare-gastown/src/dos/Town.do.ts | 29 ++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/cloudflare-gastown/src/dos/Town.do.ts b/cloudflare-gastown/src/dos/Town.do.ts index f223d8f8df..790ad323ba 100644 --- a/cloudflare-gastown/src/dos/Town.do.ts +++ b/cloudflare-gastown/src/dos/Town.do.ts @@ -3182,6 +3182,22 @@ export class TownDO extends DurableObject { // Poll open PRs created by the 'pr' strategy await this.pollPendingPRs(); + // Cleanup: if the refinery is hooked to a terminal MR bead (closed/failed), + // unhook it so it's available for new work. This happens when gt_done or + // pollPendingPRs closes the MR while the refinery is still running. + const existingRefinery = agents.listAgents(this.sql, { role: 'refinery' })[0]; + if (existingRefinery?.current_hook_bead_id) { + const hookedMr = beadOps.getBead(this.sql, existingRefinery.current_hook_bead_id); + if (hookedMr && (hookedMr.status === 'closed' || hookedMr.status === 'failed')) { + agents.unhookBead(this.sql, existingRefinery.id); + agents.updateAgentStatus(this.sql, existingRefinery.id, 'idle'); + agents.writeCheckpoint(this.sql, existingRefinery.id, null); + console.log( + `${TOWN_LOG} processReviewQueue: unhooked refinery from terminal MR bead=${hookedMr.bead_id}` + ); + } + } + // Retry: if the refinery is idle but still hooked to an in_progress // MR bead, either the previous dispatch failed (container not ready) // or the dispatch succeeded but we got a false negative (timeout). 
@@ -3192,21 +3208,19 @@ export class TownDO extends DurableObject { refineryForRetry?.status === 'idle' && refineryForRetry.current_hook_bead_id ) { - const hookedMr = beadOps.getBead(this.sql, refineryForRetry.current_hook_bead_id); - if (hookedMr?.status === 'in_progress' && hookedMr.type === 'merge_request') { + const hookedRetryMr = beadOps.getBead(this.sql, refineryForRetry.current_hook_bead_id); + if (hookedRetryMr?.status === 'in_progress' && hookedRetryMr.type === 'merge_request') { const containerStatus = await dispatch.checkAgentContainerStatus( this.env, this.townId, refineryForRetry.id ); if (containerStatus.status === 'running') { - // Agent IS running — restore working status agents.updateAgentStatus(this.sql, refineryForRetry.id, 'working'); return; } - // Agent is NOT running — genuinely needs re-dispatch console.log( - `${TOWN_LOG} processReviewQueue: retrying refinery dispatch for MR bead=${hookedMr.bead_id}` + `${TOWN_LOG} processReviewQueue: retrying refinery dispatch for MR bead=${hookedRetryMr.bead_id}` ); - await this.dispatchAgent(refineryForRetry, hookedMr); + await this.dispatchAgent(refineryForRetry, hookedRetryMr); return; } } @@ -3278,7 +3292,8 @@ export class TownDO extends DurableObject { // Get or create the per-rig refinery. If it already exists and is busy // (processing another review), put the entry back to 'open' so it gets - // retried on the next alarm cycle. + // retried on the next alarm cycle. Re-fetch since the cleanup block + // above may have changed it. 
const refineryAgent = agents.getOrCreateAgent(this.sql, 'refinery', rigId, this.townId); if (refineryAgent.status !== 'idle') { console.log( From f50b14784ded5879fa1df1a0e3d99bde200484ed Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Thu, 19 Mar 2026 17:05:59 -0500 Subject: [PATCH 47/47] fix(gastown): check container status before freeing refinery from terminal MR MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After gt_done closes an MR bead, the refinery process is still running in the container (finishing its LLM turn). The cleanup block in processReviewQueue was immediately unhooking the refinery and making it available for new work, causing it to be dispatched for a new MR while the old session was still active in the container. Now check container status first: if the refinery is still running, skip cleanup and return — don't pop any new MR this tick. Wait for agentCompleted to fire (session ends), then clean up on the next tick. This prevents dispatching a new review to a refinery whose container session is still active from the previous review. --- cloudflare-gastown/src/dos/Town.do.ts | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/cloudflare-gastown/src/dos/Town.do.ts b/cloudflare-gastown/src/dos/Town.do.ts index 790ad323ba..e6a0400578 100644 --- a/cloudflare-gastown/src/dos/Town.do.ts +++ b/cloudflare-gastown/src/dos/Town.do.ts @@ -3183,12 +3183,21 @@ export class TownDO extends DurableObject { await this.pollPendingPRs(); // Cleanup: if the refinery is hooked to a terminal MR bead (closed/failed), - // unhook it so it's available for new work. This happens when gt_done or - // pollPendingPRs closes the MR while the refinery is still running. + // check if the container session has actually ended before making it + // available. The refinery may still be running (finishing its LLM turn + // after gt_done returned). 
If still running, skip — wait for + // agentCompleted to fire, then clean up on the next tick. const existingRefinery = agents.listAgents(this.sql, { role: 'refinery' })[0]; if (existingRefinery?.current_hook_bead_id) { const hookedMr = beadOps.getBead(this.sql, existingRefinery.current_hook_bead_id); if (hookedMr && (hookedMr.status === 'closed' || hookedMr.status === 'failed')) { + const containerStatus = await dispatch.checkAgentContainerStatus( + this.env, this.townId, existingRefinery.id + ); + if (containerStatus.status === 'running') { + // Session still active — don't unhook or pop new MR this tick + return; + } agents.unhookBead(this.sql, existingRefinery.id); agents.updateAgentStatus(this.sql, existingRefinery.id, 'idle'); agents.writeCheckpoint(this.sql, existingRefinery.id, null);