diff --git a/cloudflare-gastown/container/plugin/client.test.ts b/cloudflare-gastown/container/plugin/client.test.ts index 35f05bb6b5..3c3c86ceb1 100644 --- a/cloudflare-gastown/container/plugin/client.test.ts +++ b/cloudflare-gastown/container/plugin/client.test.ts @@ -7,6 +7,7 @@ const TEST_ENV: GastownEnv = { sessionToken: 'test-jwt-token', agentId: 'agent-111', rigId: 'rig-222', + townId: 'town-333', }; function mockFetch(data: unknown, status = 200) { @@ -48,7 +49,9 @@ describe('GastownClient', () => { expect(fetchMock).toHaveBeenCalledTimes(1); const [url, init] = fetchMock.mock.calls[0] as [string, RequestInit]; - expect(url).toBe('https://gastown.example.com/api/rigs/rig-222/agents/agent-111/prime'); + expect(url).toBe( + 'https://gastown.example.com/api/towns/town-333/rigs/rig-222/agents/agent-111/prime' + ); const headers = new Headers(init.headers); expect(headers.get('Authorization')).toBe('Bearer test-jwt-token'); expect(headers.get('Content-Type')).toBe('application/json'); @@ -81,7 +84,7 @@ describe('GastownClient', () => { expect(result).toEqual(bead); const [url] = (globalThis.fetch as ReturnType).mock.calls[0] as [string]; - expect(url).toBe('https://gastown.example.com/api/rigs/rig-222/beads/bead-1'); + expect(url).toBe('https://gastown.example.com/api/towns/town-333/rigs/rig-222/beads/bead-1'); }); it('closeBead() sends agent_id in body', async () => { @@ -94,7 +97,9 @@ describe('GastownClient', () => { string, RequestInit, ]; - expect(url).toBe('https://gastown.example.com/api/rigs/rig-222/beads/bead-1/close'); + expect(url).toBe( + 'https://gastown.example.com/api/towns/town-333/rigs/rig-222/beads/bead-1/close' + ); expect(init.method).toBe('POST'); expect(JSON.parse(init.body as string)).toEqual({ agent_id: 'agent-111' }); }); @@ -112,7 +117,9 @@ describe('GastownClient', () => { string, RequestInit, ]; - expect(url).toBe('https://gastown.example.com/api/rigs/rig-222/agents/agent-111/done'); + expect(url).toBe( + 
'https://gastown.example.com/api/towns/town-333/rigs/rig-222/agents/agent-111/done' + ); expect(JSON.parse(init.body as string)).toEqual({ branch: 'feat/test', pr_url: 'https://github.com/pr/1', @@ -145,7 +152,9 @@ describe('GastownClient', () => { expect(result).toEqual(mail); const [url] = (globalThis.fetch as ReturnType).mock.calls[0] as [string]; - expect(url).toBe('https://gastown.example.com/api/rigs/rig-222/agents/agent-111/mail'); + expect(url).toBe( + 'https://gastown.example.com/api/towns/town-333/rigs/rig-222/agents/agent-111/mail' + ); }); it('writeCheckpoint() posts data to checkpoint endpoint', async () => { @@ -157,7 +166,9 @@ describe('GastownClient', () => { string, RequestInit, ]; - expect(url).toBe('https://gastown.example.com/api/rigs/rig-222/agents/agent-111/checkpoint'); + expect(url).toBe( + 'https://gastown.example.com/api/towns/town-333/rigs/rig-222/agents/agent-111/checkpoint' + ); expect(JSON.parse(init.body as string)).toEqual({ data: { step: 3, files: ['a.ts'] } }); }); @@ -172,7 +183,7 @@ describe('GastownClient', () => { string, RequestInit, ]; - expect(url).toBe('https://gastown.example.com/api/rigs/rig-222/escalations'); + expect(url).toBe('https://gastown.example.com/api/towns/town-333/rigs/rig-222/escalations'); expect(JSON.parse(init.body as string)).toEqual({ title: 'blocked', priority: 'high' }); }); @@ -246,7 +257,9 @@ describe('GastownClient', () => { // Verify no double slashes in the URL by calling prime void c.prime(); const [url] = (globalThis.fetch as ReturnType).mock.calls[0] as [string]; - expect(url).toBe('https://gastown.example.com/api/rigs/rig-222/agents/agent-111/prime'); + expect(url).toBe( + 'https://gastown.example.com/api/towns/town-333/rigs/rig-222/agents/agent-111/prime' + ); }); }); @@ -262,6 +275,7 @@ describe('createClientFromEnv', () => { process.env.GASTOWN_SESSION_TOKEN = 'tok'; process.env.GASTOWN_AGENT_ID = 'agent-1'; process.env.GASTOWN_RIG_ID = 'rig-1'; + process.env.GASTOWN_TOWN_ID = 'town-1'; 
const client = createClientFromEnv(); expect(client).toBeInstanceOf(GastownClient); diff --git a/cloudflare-gastown/container/plugin/client.ts b/cloudflare-gastown/container/plugin/client.ts index 1627c2a16f..daf504f3e8 100644 --- a/cloudflare-gastown/container/plugin/client.ts +++ b/cloudflare-gastown/container/plugin/client.ts @@ -26,12 +26,14 @@ function isApiResponse( export class GastownClient { private baseUrl: string; + private containerToken: string | undefined; private token: string; private agentId: string; private rigId: string; private townId: string; constructor(env: GastownEnv) { this.baseUrl = env.apiUrl.replace(/\/+$/, ''); + this.containerToken = env.containerToken; this.token = env.sessionToken; this.agentId = env.agentId; this.rigId = env.rigId; @@ -50,7 +52,18 @@ export class GastownClient { // Normalize headers so callers can pass plain objects, Headers instances, or tuples const headers = new Headers(init?.headers); headers.set('Content-Type', 'application/json'); - headers.set('Authorization', `Bearer ${this.token}`); + // Prefer the live container token from process.env (refreshed by the + // TownDO alarm via POST /refresh-token), then the token captured at + // init, then the legacy per-agent JWT. + const authToken = process.env.GASTOWN_CONTAINER_TOKEN ?? this.containerToken ?? this.token; + headers.set('Authorization', `Bearer ${authToken}`); + // When using a container-scoped JWT, send agent identity headers so + // the auth middleware can populate agentId/rigId on routes that don't + // have :agentId/:rigId params (e.g. /triage/resolve, /mail). + if (process.env.GASTOWN_CONTAINER_TOKEN || this.containerToken) { + headers.set('X-Gastown-Agent-Id', this.agentId); + headers.set('X-Gastown-Rig-Id', this.rigId); + } let response: Response; try { @@ -193,16 +206,18 @@ export class GastownClient { /** * Mayor-scoped client for town-level cross-rig operations. - * Uses `/api/mayor/:townId/tools/*` routes authenticated via townId-scoped JWT. 
+ * Uses `/api/mayor/:townId/tools/*` routes authenticated via container secret or JWT. */ export class MayorGastownClient { private baseUrl: string; + private containerToken: string | undefined; private token: string; private agentId: string; private townId: string; constructor(env: MayorGastownEnv) { this.baseUrl = env.apiUrl.replace(/\/+$/, ''); + this.containerToken = env.containerToken; this.token = env.sessionToken; this.agentId = env.agentId; this.townId = env.townId; @@ -215,7 +230,13 @@ export class MayorGastownClient { private async request(url: string, init?: RequestInit): Promise { const headers = new Headers(init?.headers); headers.set('Content-Type', 'application/json'); - headers.set('Authorization', `Bearer ${this.token}`); + // Prefer live container token (refreshed via POST /refresh-token), + // then init-time token, then legacy per-agent JWT. + const authToken = process.env.GASTOWN_CONTAINER_TOKEN ?? this.containerToken ?? this.token; + headers.set('Authorization', `Bearer ${authToken}`); + if (process.env.GASTOWN_CONTAINER_TOKEN || this.containerToken) { + headers.set('X-Gastown-Agent-Id', this.agentId); + } let response: Response; try { @@ -334,15 +355,18 @@ export class GastownApiError extends Error { export function createClientFromEnv(): GastownClient { const apiUrl = process.env.GASTOWN_API_URL; + const containerToken = process.env.GASTOWN_CONTAINER_TOKEN; const sessionToken = process.env.GASTOWN_SESSION_TOKEN; const agentId = process.env.GASTOWN_AGENT_ID; const rigId = process.env.GASTOWN_RIG_ID; const townId = process.env.GASTOWN_TOWN_ID; - if (!apiUrl || !sessionToken || !agentId || !rigId || !townId) { + // Require either containerToken or sessionToken (prefer containerToken) + const hasAuth = containerToken || sessionToken; + if (!apiUrl || !hasAuth || !agentId || !rigId || !townId) { const missing = [ !apiUrl && 'GASTOWN_API_URL', - !sessionToken && 'GASTOWN_SESSION_TOKEN', + !hasAuth && 'GASTOWN_CONTAINER_TOKEN or 
GASTOWN_SESSION_TOKEN', !agentId && 'GASTOWN_AGENT_ID', !rigId && 'GASTOWN_RIG_ID', !townId && 'GASTOWN_TOWN_ID', @@ -350,24 +374,39 @@ export function createClientFromEnv(): GastownClient { throw new Error(`Missing required Gastown environment variables: ${missing.join(', ')}`); } - return new GastownClient({ apiUrl, sessionToken, agentId, rigId, townId }); + return new GastownClient({ + apiUrl, + containerToken: containerToken ?? undefined, + sessionToken: sessionToken ?? '', + agentId, + rigId, + townId, + }); } export function createMayorClientFromEnv(): MayorGastownClient { const apiUrl = process.env.GASTOWN_API_URL; + const containerToken = process.env.GASTOWN_CONTAINER_TOKEN; const sessionToken = process.env.GASTOWN_SESSION_TOKEN; const agentId = process.env.GASTOWN_AGENT_ID; const townId = process.env.GASTOWN_TOWN_ID; - if (!apiUrl || !sessionToken || !agentId || !townId) { + const hasAuth = containerToken || sessionToken; + if (!apiUrl || !hasAuth || !agentId || !townId) { const missing = [ !apiUrl && 'GASTOWN_API_URL', - !sessionToken && 'GASTOWN_SESSION_TOKEN', + !hasAuth && 'GASTOWN_CONTAINER_TOKEN or GASTOWN_SESSION_TOKEN', !agentId && 'GASTOWN_AGENT_ID', !townId && 'GASTOWN_TOWN_ID', ].filter(Boolean); throw new Error(`Missing required mayor environment variables: ${missing.join(', ')}`); } - return new MayorGastownClient({ apiUrl, sessionToken, agentId, townId }); + return new MayorGastownClient({ + apiUrl, + containerToken: containerToken ?? undefined, + sessionToken: sessionToken ?? 
'', + agentId, + townId, + }); } diff --git a/cloudflare-gastown/container/plugin/types.ts b/cloudflare-gastown/container/plugin/types.ts index 2a2ad85a93..7f83b5e132 100644 --- a/cloudflare-gastown/container/plugin/types.ts +++ b/cloudflare-gastown/container/plugin/types.ts @@ -119,6 +119,9 @@ export type ConvoyDetail = Convoy & { // Environment variable config for the plugin (rig-scoped agents) export type GastownEnv = { apiUrl: string; + /** Container-scoped JWT (shared by all agents, refreshed by alarm). */ + containerToken?: string; + /** Legacy per-agent JWT (8h expiry) — fallback during rollout. */ sessionToken: string; agentId: string; rigId: string; @@ -128,6 +131,9 @@ export type GastownEnv = { // Environment variable config for the mayor (town-scoped) export type MayorGastownEnv = { apiUrl: string; + /** Container-scoped JWT (shared by all agents, refreshed by alarm). */ + containerToken?: string; + /** Legacy per-agent JWT (8h expiry) — fallback during rollout. */ sessionToken: string; agentId: string; townId: string; diff --git a/cloudflare-gastown/container/src/agent-runner.ts b/cloudflare-gastown/container/src/agent-runner.ts index 803b46ba0a..ec9c0fa8a5 100644 --- a/cloudflare-gastown/container/src/agent-runner.ts +++ b/cloudflare-gastown/container/src/agent-runner.ts @@ -91,7 +91,12 @@ function buildAgentEnv(request: StartAgentRequest): Record { // the request or the container's own environment. // (KILO_API_URL and KILO_OPENROUTER_BASE are set at the container level // via TownContainerDO.envVars and inherited through process.env.) 
- const conditionalKeys = ['GASTOWN_API_URL', 'GASTOWN_SESSION_TOKEN', 'KILOCODE_TOKEN']; + const conditionalKeys = [ + 'GASTOWN_API_URL', + 'GASTOWN_CONTAINER_TOKEN', + 'GASTOWN_SESSION_TOKEN', + 'KILOCODE_TOKEN', + ]; for (const key of conditionalKeys) { const value = resolveEnv(request, key); if (value) { @@ -328,24 +333,23 @@ async function verifyGitCredentials( } /** - * Create a minimal git-initialized workspace for the mayor agent. - * The mayor doesn't need a real repo clone — it's a conversational - * orchestrator that delegates work via tools. But kilo serve requires - * a git repo in the working directory. + * Create a minimal git-initialized workspace for a reasoning-only agent + * (e.g. triage) that doesn't need a real repo clone. + * kilo serve requires a git repo in the working directory, so we init + * a bare local repo with an empty initial commit. */ -async function createMayorWorkspace(rigId: string): Promise { +async function createLightweightWorkspace(label: string, rigId: string): Promise { const { mkdir: mkdirAsync } = await import('node:fs/promises'); const { existsSync } = await import('node:fs'); const path = await import('node:path'); - // Validate rigId to prevent path traversal (rigId is synthetic: "mayor-") + // Validate to prevent path traversal // eslint-disable-next-line no-control-regex if (!rigId || /\.\.[/\\]|[/\\]\.\.|^\.\.$/.test(rigId) || /[\x00-\x1f]/.test(rigId)) { - throw new Error(`Invalid rigId for mayor workspace: ${rigId}`); + throw new Error(`Invalid rigId for lightweight workspace: ${rigId}`); } - const dir = path.resolve('/workspace/rigs', rigId, 'mayor-workspace'); + const dir = path.resolve('/workspace/rigs', rigId, `${label}-workspace`); await mkdirAsync(dir, { recursive: true }); - // Initialize a bare git repo if not already present if (!existsSync(`${dir}/.git`)) { const init = Bun.spawn(['git', 'init'], { cwd: dir, stdout: 'pipe', stderr: 'pipe' }); await init.exited; @@ -355,12 +359,22 @@ async function 
createMayorWorkspace(rigId: string): Promise { stderr: 'pipe', }); await commit.exited; - console.log(`Created mayor workspace at ${dir}`); + console.log(`Created ${label} workspace at ${dir}`); } return dir; } +/** + * Create a minimal git-initialized workspace for the mayor agent. + * The mayor doesn't need a real repo clone — it's a conversational + * orchestrator that delegates work via tools. But kilo serve requires + * a git repo in the working directory. + */ +async function createMayorWorkspace(rigId: string): Promise { + return createLightweightWorkspace('mayor', rigId); +} + /** * Write the mayor's system prompt to AGENTS.md in the workspace. * @@ -415,7 +429,7 @@ async function writeMayorSystemPromptToAgentsMd( /** * Run the full agent startup sequence: - * 1. Clone/fetch the rig's git repo (or create minimal workspace for mayor) + * 1. Clone/fetch the rig's git repo (or create minimal workspace for mayor/triage) * 2. Create an isolated worktree for the agent's branch * 3. Configure git credentials for push/fetch * 4. Start a kilo serve instance for the worktree (or reuse existing) @@ -425,7 +439,11 @@ export async function runAgent(originalRequest: StartAgentRequest): Promise { const apiUrl = agent.gastownApiUrl; - const token = agent.gastownSessionToken; - if (!apiUrl || !token) { + // Prefer live container token (refreshed via POST /refresh-token) + const authToken = + process.env.GASTOWN_CONTAINER_TOKEN ?? agent.gastownContainerToken ?? agent.gastownSessionToken; + if (!apiUrl || !authToken) { console.warn( `Cannot report agent ${agent.agentId} completion: no API credentials on agent record` ); @@ -29,12 +31,14 @@ export async function reportAgentCompleted( agent.completionCallbackUrl ?? 
`${apiUrl}/api/towns/${agent.townId}/rigs/${agent.rigId}/agents/${agent.agentId}/completed`; try { + const headers: Record = { + 'Content-Type': 'application/json', + Authorization: `Bearer ${authToken}`, + }; + const response = await fetch(url, { method: 'POST', - headers: { - 'Content-Type': 'application/json', - Authorization: `Bearer ${token}`, - }, + headers, body: JSON.stringify({ status, reason, agentId: agent.agentId }), }); diff --git a/cloudflare-gastown/container/src/control-server.ts b/cloudflare-gastown/container/src/control-server.ts index d86461156b..14c6b56fcd 100644 --- a/cloudflare-gastown/container/src/control-server.ts +++ b/cloudflare-gastown/container/src/control-server.ts @@ -93,6 +93,20 @@ app.get('/health', c => { return c.json(response); }); +// POST /refresh-token +// Hot-swap the container-scoped JWT on the running process. Called by +// the TownDO alarm to push a fresh token before the current one expires. +// Updates process.env so all subsequent API calls use the new token. +app.post('/refresh-token', async c => { + const body: unknown = await c.req.json().catch(() => null); + if (!body || typeof body !== 'object' || !('token' in body) || typeof body.token !== 'string') { + return c.json({ error: 'Missing or invalid token field' }, 400); + } + process.env.GASTOWN_CONTAINER_TOKEN = body.token; + console.log('[control-server] Container token refreshed'); + return c.json({ refreshed: true }); +}); + // POST /agents/start app.post('/agents/start', async c => { const body: unknown = await c.req.json().catch(() => null); @@ -113,8 +127,13 @@ app.post('/agents/start', async c => { `[control-server] /agents/start: success agentId=${agent.agentId} port=${agent.serverPort} session=${agent.sessionId}` ); // Strip sensitive fields before returning — the caller only needs - // agent metadata, not the internal session token or API URL. 
- const { gastownSessionToken: _, gastownApiUrl: _url, ...safeAgent } = agent; + // agent metadata, not the internal tokens or API URL. + const { + gastownSessionToken: _, + gastownContainerToken: _ct, + gastownApiUrl: _url, + ...safeAgent + } = agent; return c.json(safeAgent, 201); } catch (err) { const message = err instanceof Error ? err.message : String(err); @@ -292,7 +311,12 @@ app.post('/git/merge', async c => { // Run the merge in the background so we can return 202 immediately. // The Rig DO will be notified via callback when the merge completes. const apiUrl = req.envVars?.GASTOWN_API_URL ?? process.env.GASTOWN_API_URL; - const token = req.envVars?.GASTOWN_SESSION_TOKEN ?? process.env.GASTOWN_SESSION_TOKEN; + // Prefer container secret (no expiry) over session token (8h JWT) + const token = + req.envVars?.GASTOWN_CONTAINER_TOKEN ?? + process.env.GASTOWN_CONTAINER_TOKEN ?? + req.envVars?.GASTOWN_SESSION_TOKEN ?? + process.env.GASTOWN_SESSION_TOKEN; const doMerge = async () => { const outcome = await mergeBranch({ @@ -515,11 +539,12 @@ app.onError((err, c) => { export function startControlServer(): void { const PORT = 8080; - // Start heartbeat if env vars are configured + // Start heartbeat if env vars are configured. + // Prefer container secret (no expiry) over session token (8h JWT). const apiUrl = process.env.GASTOWN_API_URL; - const sessionToken = process.env.GASTOWN_SESSION_TOKEN; - if (apiUrl && sessionToken) { - startHeartbeat(apiUrl, sessionToken); + const authToken = process.env.GASTOWN_CONTAINER_TOKEN ?? 
process.env.GASTOWN_SESSION_TOKEN; + if (apiUrl && authToken) { + startHeartbeat(apiUrl, authToken); } // Handle graceful shutdown diff --git a/cloudflare-gastown/container/src/heartbeat.ts b/cloudflare-gastown/container/src/heartbeat.ts index b09531207b..8d78c31ff6 100644 --- a/cloudflare-gastown/container/src/heartbeat.ts +++ b/cloudflare-gastown/container/src/heartbeat.ts @@ -39,7 +39,10 @@ export function stopHeartbeat(): void { } async function sendHeartbeats(): Promise { - if (!gastownApiUrl || !sessionToken) return; + // Prefer the live container token (refreshed via POST /refresh-token) + // over the token captured at startHeartbeat() time. + const currentToken = process.env.GASTOWN_CONTAINER_TOKEN ?? sessionToken; + if (!gastownApiUrl || !currentToken) return; const active = listAgents().filter(a => a.status === 'running' || a.status === 'starting'); @@ -53,14 +56,15 @@ async function sendHeartbeats(): Promise { }; try { + const headers: Record = { + 'Content-Type': 'application/json', + Authorization: `Bearer ${currentToken}`, + }; const response = await fetch( `${gastownApiUrl}/api/towns/${agent.townId}/rigs/${agent.rigId}/agents/${agent.agentId}/heartbeat`, { method: 'POST', - headers: { - 'Content-Type': 'application/json', - Authorization: `Bearer ${sessionToken}`, - }, + headers, body: JSON.stringify(payload), } ); diff --git a/cloudflare-gastown/container/src/process-manager.ts b/cloudflare-gastown/container/src/process-manager.ts index ea4afcfda9..a0c70fce4e 100644 --- a/cloudflare-gastown/container/src/process-manager.ts +++ b/cloudflare-gastown/container/src/process-manager.ts @@ -97,16 +97,29 @@ function broadcastEvent(agentId: string, event: string, data: unknown): void { // Persist to AgentDO via the worker (fire-and-forget) const agent = agents.get(agentId); - if (agent?.gastownApiUrl && agent.gastownSessionToken) { + // Prefer live container token (refreshed via POST /refresh-token), + // then the per-agent cached token, then the legacy 
session token. + const authToken = + process.env.GASTOWN_CONTAINER_TOKEN ?? + agent?.gastownContainerToken ?? + agent?.gastownSessionToken; + if (agent?.gastownApiUrl && authToken) { + const headers: Record = { + 'Content-Type': 'application/json', + Authorization: `Bearer ${authToken}`, + }; + // When using a container JWT, send agent identity so the handler's + // getEnforcedAgentId() ownership check still works. + if (process.env.GASTOWN_CONTAINER_TOKEN || agent.gastownContainerToken) { + headers['X-Gastown-Agent-Id'] = agentId; + if (agent.rigId) headers['X-Gastown-Rig-Id'] = agent.rigId; + } // POST to the worker's agent-events endpoint for persistent storage fetch( `${agent.gastownApiUrl}/api/towns/${agent.townId ?? '_'}/rigs/${agent.rigId ?? '_'}/agent-events`, { method: 'POST', - headers: { - 'Content-Type': 'application/json', - Authorization: `Bearer ${agent.gastownSessionToken}`, - }, + headers, body: JSON.stringify({ agent_id: agentId, event_type: event, @@ -315,6 +328,8 @@ export async function startAgent( messageCount: 0, exitReason: null, gastownApiUrl: request.envVars?.GASTOWN_API_URL ?? process.env.GASTOWN_API_URL ?? null, + gastownContainerToken: + request.envVars?.GASTOWN_CONTAINER_TOKEN ?? process.env.GASTOWN_CONTAINER_TOKEN ?? null, gastownSessionToken: request.envVars?.GASTOWN_SESSION_TOKEN ?? null, completionCallbackUrl: request.envVars?.GASTOWN_COMPLETION_CALLBACK_URL ?? null, model: request.model ?? 
null, diff --git a/cloudflare-gastown/container/src/types.ts b/cloudflare-gastown/container/src/types.ts index 7da0d95778..b47b485e72 100644 --- a/cloudflare-gastown/container/src/types.ts +++ b/cloudflare-gastown/container/src/types.ts @@ -2,7 +2,7 @@ import { z } from 'zod'; // ── Agent roles (mirrors worker types) ────────────────────────────────── -export const AgentRole = z.enum(['mayor', 'polecat', 'refinery']); +export const AgentRole = z.enum(['mayor', 'polecat', 'refinery', 'triage']); export type AgentRole = z.infer; // ── Control server request/response schemas ───────────────────────────── @@ -107,7 +107,9 @@ export type ManagedAgent = { exitReason: string | null; /** Gastown worker API URL for completion callbacks */ gastownApiUrl: string | null; - /** Agent-scoped JWT for authenticating callbacks to the Gastown worker */ + /** Container-scoped JWT (shared by all agents, refreshed by alarm). */ + gastownContainerToken: string | null; + /** Legacy per-agent JWT for authenticating callbacks to the Gastown worker. */ gastownSessionToken: string | null; /** Override the default completion callback URL (for agents not backed by a Rig DO) */ completionCallbackUrl: string | null; diff --git a/cloudflare-gastown/src/dos/Town.do.ts b/cloudflare-gastown/src/dos/Town.do.ts index 6a7e719cd3..4f34fde7c6 100644 --- a/cloudflare-gastown/src/dos/Town.do.ts +++ b/cloudflare-gastown/src/dos/Town.do.ts @@ -1964,6 +1964,17 @@ export class TownDO extends DurableObject { } } + // Refresh the container-scoped JWT before any work that might + // trigger API calls. Throttled to once per hour (tokens have 8h + // expiry, so hourly refresh provides ample safety margin). + if (this.hasActiveWork()) { + try { + await this.refreshContainerToken(); + } catch (err) { + console.warn(`${TOWN_LOG} alarm: refreshContainerToken failed`, err); + } + } + // Process reviews FIRST so the refinery gets assigned before the // scheduler dispatches new polecats. 
This prevents downstream beads // from starting before upstream reviews are merged. @@ -2022,6 +2033,28 @@ export class TownDO extends DurableObject { } } + /** + * Push a fresh container-scoped JWT to the TownContainerDO. Called + * from the alarm handler, throttled to once per hour (tokens have + * 8h expiry). The TownContainerDO stores it as an env var so it's + * available to all agents in the container. + */ + private lastContainerTokenRefreshAt = 0; + private async refreshContainerToken(): Promise { + const TOKEN_REFRESH_INTERVAL_MS = 60 * 60_000; // 1 hour + const now = Date.now(); + if (now - this.lastContainerTokenRefreshAt < TOKEN_REFRESH_INTERVAL_MS) return; + + const townId = this.townId; + if (!townId) return; + const townConfig = await this.getTownConfig(); + const userId = townConfig.owner_user_id ?? townId; + await dispatch.refreshContainerToken(this.env, townId, userId); + // Only mark as refreshed after success — failed refreshes should + // be retried on the next alarm tick, not throttled for an hour. + this.lastContainerTokenRefreshAt = now; + } + private hasActiveWork(): boolean { const activeAgentRows = [ ...query( @@ -2500,7 +2533,9 @@ export class TownDO extends DurableObject { userId: rigConfig.userId, agentId: triageAgent.id, agentName: triageAgent.name, - role: 'polecat', + // Use 'triage' role so the container skips the git clone entirely. + // Triage work is purely reasoning — no code changes needed. + role: 'triage', identity: triageAgent.identity, beadId: triageBead.bead_id, beadTitle: triageBead.title, @@ -2521,6 +2556,18 @@ export class TownDO extends DurableObject { } else { agents.unhookBead(this.sql, triageAgent.id); beadOps.updateBeadStatus(this.sql, triageBead.bead_id, 'failed', triageAgent.id); + // Apply dispatch cooldown so the next alarm tick doesn't immediately + // retry. Setting last_activity_at = now() makes the agent invisible + // to schedulePendingWork for DISPATCH_COOLDOWN_MS (2 min). 
+ query( + this.sql, + /* sql */ ` + UPDATE ${agent_metadata} + SET ${agent_metadata.columns.last_activity_at} = ? + WHERE ${agent_metadata.bead_id} = ? + `, + [now(), triageAgent.id] + ); console.error(`${TOWN_LOG} maybeDispatchTriageAgent: triage agent failed to start`); } } diff --git a/cloudflare-gastown/src/dos/town/beads.ts b/cloudflare-gastown/src/dos/town/beads.ts index 846ff632f7..9e6581b451 100644 --- a/cloudflare-gastown/src/dos/town/beads.ts +++ b/cloudflare-gastown/src/dos/town/beads.ts @@ -226,7 +226,7 @@ export function updateBeadStatus( * recount closed beads and update convoy_metadata. Auto-lands the * convoy when all tracked beads are closed. */ -function updateConvoyProgress(sql: SqlStorage, beadId: string, timestamp: string): void { +export function updateConvoyProgress(sql: SqlStorage, beadId: string, timestamp: string): void { const convoyRows = [ ...query( sql, @@ -667,11 +667,18 @@ export function getConvoyDependencyEdges( } /** - * Find the convoy a bead belongs to (if any) via 'tracks' dependencies. - * Returns the convoy bead_id or null. + * Find the convoy a bead belongs to (if any). + * + * Two cases: + * 1. Normal source bead: tracked by a convoy via bead_dependencies + * (bead_id = sourceBeadId, depends_on_bead_id = convoyId, type = 'tracks'). + * Returns the convoy bead_id. + * 2. The bead IS the convoy (e.g. for the final landing MR where processConvoyLandings + * passes the convoy bead_id as the source). Returns beadId itself. 
*/ export function getConvoyForBead(sql: SqlStorage, beadId: string): string | null { - const rows = [ + // Case 1: bead is tracked by a convoy + const trackRows = [ ...query( sql, /* sql */ ` @@ -683,8 +690,24 @@ export function getConvoyForBead(sql: SqlStorage, beadId: string): string | null [beadId] ), ]; - if (rows.length === 0) return null; - return z.object({ depends_on_bead_id: z.string() }).parse(rows[0]).depends_on_bead_id; + if (trackRows.length > 0) { + return z.object({ depends_on_bead_id: z.string() }).parse(trackRows[0]).depends_on_bead_id; + } + + // Case 2: bead is itself a convoy (has convoy_metadata) + const metaRows = [ + ...query( + sql, + /* sql */ ` + SELECT 1 FROM ${convoy_metadata} + WHERE ${convoy_metadata.bead_id} = ? + `, + [beadId] + ), + ]; + if (metaRows.length > 0) return beadId; + + return null; } /** diff --git a/cloudflare-gastown/src/dos/town/container-dispatch.ts b/cloudflare-gastown/src/dos/town/container-dispatch.ts index 841ae4e713..51c6b67ed7 100644 --- a/cloudflare-gastown/src/dos/town/container-dispatch.ts +++ b/cloudflare-gastown/src/dos/town/container-dispatch.ts @@ -4,7 +4,7 @@ */ import { getTownContainerStub } from '../TownContainer.do'; -import { signAgentJWT } from '../../util/jwt.util'; +import { signAgentJWT, signContainerJWT } from '../../util/jwt.util'; import { buildPolecatSystemPrompt } from '../../prompts/polecat-system.prompt'; import { buildMayorSystemPrompt } from '../../prompts/mayor-system.prompt'; import type { TownConfig } from '../../types'; @@ -41,6 +41,9 @@ export async function resolveJWTSecret(env: Env): Promise { /** * Mint a short-lived agent JWT for the given agent to authenticate * API calls back to the gastown worker. + * + * @deprecated Prefer container secrets (ensureContainerSecret) for new code. + * Agent JWTs are retained for backwards compatibility during rollout. 
*/ export async function mintAgentToken( env: Env, @@ -57,6 +60,73 @@ export async function mintAgentToken( ); } +/** + * Mint a container-scoped JWT and push it to the TownContainerDO. + * One JWT per container — shared by all agents in the town. Carries + * { townId, userId, scope: 'container' } with 8h expiry. + * + * Pushes via both setEnvVar() (for next container boot) and + * POST /refresh-token (for the running process). This ensures that + * all code paths — existing agents, heartbeat, event persistence — + * pick up the fresh token immediately. + * + * Returns the token so callers can also pass it as a per-agent env var. + */ +export async function ensureContainerToken( + env: Env, + townId: string, + userId: string +): Promise { + const jwtSecret = await resolveJWTSecret(env); + if (!jwtSecret) { + console.error(`${TOWN_LOG} ensureContainerToken: no JWT secret available`); + return null; + } + + const token = signContainerJWT({ townId, userId }, jwtSecret); + const container = getTownContainerStub(env, townId); + + // Store for next boot + try { + await container.setEnvVar('GASTOWN_CONTAINER_TOKEN', token); + } catch (err) { + console.warn( + `${TOWN_LOG} ensureContainerToken: setEnvVar failed (container may not be running):`, + err instanceof Error ? err.message : err + ); + } + + // Push to running process so existing agents pick up the fresh token. + // Throw on non-2xx so the alarm's throttle doesn't advance on failure. + try { + const resp = await container.fetch('http://container/refresh-token', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ token }), + }); + if (!resp.ok) { + throw new Error(`container returned ${resp.status}`); + } + } catch (err) { + // If the container isn't running yet, the token will be in envVars + // when it boots. But if it IS running and rejected the refresh, + // propagate the error so the alarm retries on the next tick. 
+ const isContainerDown = + err instanceof TypeError || (err instanceof Error && err.message.includes('fetch')); + if (!isContainerDown) throw err; + } + + return token; +} + +/** + * Alias for ensureContainerToken — both functions now push to the + * running container process via POST /refresh-token. Kept as a + * separate export for call-site readability (alarm code calls + * "refresh", dispatch code calls "ensure"). + */ +export const refreshContainerToken = ensureContainerToken; + /** Build the initial prompt for an agent from its bead. */ export function buildPrompt(params: { beadTitle: string; @@ -183,17 +253,23 @@ export async function startAgentInContainer( `${TOWN_LOG} startAgentInContainer: agentId=${params.agentId} role=${params.role} name=${params.agentName}` ); try { - const token = await mintAgentToken(env, { + // Mint a container-scoped JWT (8h expiry, refreshed by TownDO alarm). + // One token per container — shared by all agents in the town. + // Carries { townId, userId, scope: 'container' }. + const containerToken = await ensureContainerToken(env, params.townId, params.userId); + + // Also mint a per-agent JWT as fallback during rollout. + const agentToken = await mintAgentToken(env, { agentId: params.agentId, rigId: params.rigId, townId: params.townId, userId: params.userId, }); - if (!token) { + if (!containerToken && !agentToken) { console.error( - `${TOWN_LOG} startAgentInContainer: ABORTING — failed to mint JWT for agent ${params.agentId}. ` + - 'The agent would start without GASTOWN_SESSION_TOKEN and be unable to call back to the worker.' + `${TOWN_LOG} startAgentInContainer: ABORTING — failed to mint any auth token for agent ${params.agentId}. ` + + 'The agent would start without credentials and be unable to call back to the worker.' 
); return false; } @@ -212,13 +288,16 @@ export async function startAgentInContainer( envVars.GITLAB_INSTANCE_URL = params.townConfig.git_auth.gitlab_instance_url; } - if (token) envVars.GASTOWN_SESSION_TOKEN = token; + // Container token is preferred (shared by all agents, refreshed by alarm). + // Legacy per-agent JWT kept as fallback during rollout. + if (containerToken) envVars.GASTOWN_CONTAINER_TOKEN = containerToken; + if (agentToken) envVars.GASTOWN_SESSION_TOKEN = agentToken; // kilocodeToken: prefer rig-level, fall back to town config const kilocodeToken = params.kilocodeToken ?? params.townConfig.kilocode_token; if (kilocodeToken) envVars.KILOCODE_TOKEN = kilocodeToken; console.log( - `${TOWN_LOG} startAgentInContainer: envVars built: keys=[${Object.keys(envVars).join(',')}] hasGitToken=${!!envVars.GIT_TOKEN} hasGitlabToken=${!!envVars.GITLAB_TOKEN} hasJwt=${!!token} hasKilocodeToken=${!!kilocodeToken} git_auth_keys=[${Object.keys(params.townConfig.git_auth ?? {}).join(',')}]` + `${TOWN_LOG} startAgentInContainer: envVars built: keys=[${Object.keys(envVars).join(',')}] hasGitToken=${!!envVars.GIT_TOKEN} hasGitlabToken=${!!envVars.GITLAB_TOKEN} hasContainerToken=${!!containerToken} hasAgentJwt=${!!agentToken} hasKilocodeToken=${!!kilocodeToken} git_auth_keys=[${Object.keys(params.townConfig.git_auth ?? {}).join(',')}]` ); const containerConfig = await buildContainerConfig(storage, env); @@ -302,13 +381,23 @@ export async function startMergeInContainer( } ): Promise { try { - const token = await mintAgentToken(env, { + const userId = params.townConfig.owner_user_id ?? params.townId; + const containerToken = await ensureContainerToken(env, params.townId, userId); + const agentToken = await mintAgentToken(env, { agentId: params.agentId, rigId: params.rigId, townId: params.townId, - userId: params.townConfig.owner_user_id ?? 
'', + userId, }); + if (!containerToken && !agentToken) { + console.error( + `${TOWN_LOG} startMergeInContainer: ABORTING — failed to mint any auth token for merge entry ${params.entryId}. ` + + 'The merge process would start without credentials and be unable to report results.' + ); + return false; + } + const envVars: Record = { ...(params.townConfig.env_vars ?? {}) }; if (params.townConfig.git_auth?.github_token) { envVars.GIT_TOKEN = params.townConfig.git_auth.github_token; @@ -319,7 +408,8 @@ export async function startMergeInContainer( if (params.townConfig.git_auth?.gitlab_instance_url) { envVars.GITLAB_INSTANCE_URL = params.townConfig.git_auth.gitlab_instance_url; } - if (token) envVars.GASTOWN_SESSION_TOKEN = token; + if (containerToken) envVars.GASTOWN_CONTAINER_TOKEN = containerToken; + if (agentToken) envVars.GASTOWN_SESSION_TOKEN = agentToken; if (env.GASTOWN_API_URL) envVars.GASTOWN_API_URL = env.GASTOWN_API_URL; const mergeKilocodeToken = params.kilocodeToken ?? params.townConfig.kilocode_token; if (mergeKilocodeToken) envVars.KILOCODE_TOKEN = mergeKilocodeToken; diff --git a/cloudflare-gastown/src/dos/town/patrol.ts b/cloudflare-gastown/src/dos/town/patrol.ts index 9be22cb8a6..e8cc2de374 100644 --- a/cloudflare-gastown/src/dos/town/patrol.ts +++ b/cloudflare-gastown/src/dos/town/patrol.ts @@ -37,6 +37,8 @@ export const STALE_HOOK_MS = 30 * 60_000; // 30 min export const CRASH_LOOP_WINDOW_MS = 30 * 60_000; // 30 min /** Minimum failures within the window to flag a crash loop */ export const CRASH_LOOP_THRESHOLD = 3; +/** Maximum number of open triage request beads allowed at once */ +export const MAX_OPEN_TRIAGE_REQUESTS = 5; // ── Triage request types ──────────────────────────────────────────── @@ -105,6 +107,28 @@ export function createTriageRequest( if (existing.length > 0) return; } + // Global cap: skip if there are already too many open triage requests. + // Prevents unbounded accumulation during feedback loops. 
+ const openCountRows = [ + ...query( + sql, + /* sql */ ` + SELECT COUNT(*) AS cnt FROM ${beads} + WHERE ${beads.type} = 'issue' + AND ${beads.labels} LIKE ? + AND ${beads.status} = 'open' + `, + [TRIAGE_LABEL_LIKE] + ), + ]; + const openCount = Number(z.object({ cnt: z.number() }).parse(openCountRows[0]).cnt); + if (openCount >= MAX_OPEN_TRIAGE_REQUESTS) { + console.warn( + `${LOG} createTriageRequest: global cap reached (${openCount} open), skipping type=${params.triageType}` + ); + return; + } + const metadata: TriageRequestMetadata = { triage_type: params.triageType, agent_bead_id: params.agentBeadId, @@ -559,6 +583,12 @@ export function detectCrashLoops(sql: SqlStorage): void { fail_count: z.number(), }); + // Exclude triage agents from crash loop detection — their failures must + // not create new triage requests, which would feed the feedback loop. + // An agent is considered a triage agent if its current hooked bead has + // the gt:triage or gt:triage-request label (both start with "gt:triage"). + const TRIAGE_LABEL_ANY = `%"gt:triage%`; + const rows = CrashRow.array().parse([ ...query( sql, @@ -569,10 +599,17 @@ export function detectCrashLoops(sql: SqlStorage): void { AND be.new_value = 'failed' AND be.agent_id IS NOT NULL AND be.created_at > ? + AND NOT EXISTS ( + SELECT 1 FROM ${agent_metadata} + INNER JOIN ${beads} AS hooked + ON ${agent_metadata.current_hook_bead_id} = hooked.${beads.columns.bead_id} + WHERE ${agent_metadata.bead_id} = be.agent_id + AND hooked.${beads.columns.labels} LIKE ? + ) GROUP BY be.agent_id HAVING fail_count >= ? 
`, - [windowCutoff, CRASH_LOOP_THRESHOLD] + [windowCutoff, TRIAGE_LABEL_ANY, CRASH_LOOP_THRESHOLD] ), ]); diff --git a/cloudflare-gastown/src/dos/town/review-queue.ts b/cloudflare-gastown/src/dos/town/review-queue.ts index f24c72a693..aca88240c2 100644 --- a/cloudflare-gastown/src/dos/town/review-queue.ts +++ b/cloudflare-gastown/src/dos/town/review-queue.ts @@ -18,6 +18,7 @@ import { getBead, closeBead, updateBeadStatus, + updateConvoyProgress, createBead, getConvoyForBead, getConvoyFeatureBranch, @@ -273,8 +274,18 @@ export function completeReviewWithResult( }); if (input.status === 'merged') { + const mergeTimestamp = now(); closeBead(sql, entry.bead_id, entry.agent_id); + // Explicitly trigger convoy progress for the source bead after the MR closes. + // closeBead → updateBeadStatus → updateConvoyProgress, but only if the source + // bead's status actually changes. If the polecat already closed the source bead + // before submitting to the review queue, the guard in updateBeadStatus short- + // circuits and updateConvoyProgress is never called. Calling it here directly + // ensures the convoy recounts after the MR bead is now closed (not in-flight), + // so the source bead passes the NOT EXISTS guard and counts toward closedCount. 
+ updateConvoyProgress(sql, entry.bead_id, mergeTimestamp); + // If this was a convoy landing MR, also set landed_at on the convoy metadata const sourceBead = getBead(sql, entry.bead_id); if (sourceBead?.type === 'convoy') { diff --git a/cloudflare-gastown/src/middleware/auth.middleware.ts b/cloudflare-gastown/src/middleware/auth.middleware.ts index d328d26812..31559983ff 100644 --- a/cloudflare-gastown/src/middleware/auth.middleware.ts +++ b/cloudflare-gastown/src/middleware/auth.middleware.ts @@ -1,7 +1,7 @@ import type { Context } from 'hono'; import { createMiddleware } from 'hono/factory'; import { extractBearerToken } from '@kilocode/worker-utils'; -import { verifyAgentJWT, type AgentJWTPayload } from '../util/jwt.util'; +import { verifyAgentJWT, verifyContainerJWT, type AgentJWTPayload } from '../util/jwt.util'; import { resError } from '../util/res.util'; import type { GastownEnv } from '../gastown.worker'; @@ -34,11 +34,41 @@ export const townIdMiddleware = createMiddleware(async (c, next) => }); /** - * Auth middleware that requires a valid Gastown agent JWT via - * `Authorization: Bearer `. + * Try to authenticate with a container-scoped JWT (scope: 'container'). + * Returns an AgentJWTPayload-shaped object if successful, null otherwise. + * Container JWTs carry { townId, userId } but not agentId/rigId — those + * come from the route params and are trusted because the JWT proves the + * request came from the right town's container. + */ +function tryContainerJWTAuth( + c: Context, + token: string, + jwtSecret: string +): AgentJWTPayload | null { + const result = verifyContainerJWT(token, jwtSecret); + if (!result.success) return null; + + // Populate agentId/rigId from route params, falling back to headers + // for routes that don't have :agentId/:rigId params (e.g. /triage/resolve, + // /mail). The container JWT proves the request came from this town's + // container, so we trust both the URL and the identity headers. 
+ return { + agentId: c.req.param('agentId') ?? c.req.header('X-Gastown-Agent-Id') ?? '', + rigId: c.req.param('rigId') ?? c.req.header('X-Gastown-Rig-Id') ?? '', + townId: result.payload.townId, + userId: result.payload.userId, + }; +} + +/** + * Auth middleware that accepts either: + * 1. A container-scoped JWT (scope: 'container') — preferred for container→worker calls + * 2. A legacy per-agent JWT (HS256, 8h expiry) — retained for backwards compatibility * - * Sets `agentJWT` on the Hono context. Also validates the JWT's townId - * and rigId match the route params to prevent cross-town/cross-rig access. + * Sets `agentJWT` on the Hono context. Validates: + * - townId always (cross-town guard) + * - rigId only for legacy agent JWTs (container JWTs are town-scoped; + * the container is trusted to call correct rig endpoints) */ export const authMiddleware = createMiddleware(async (c, next) => { const token = extractBearerToken(c.req.header('Authorization')); @@ -52,31 +82,48 @@ export const authMiddleware = createMiddleware(async (c, next) => { return c.json(resError('Internal server error'), 500); } - const result = verifyAgentJWT(token, secret); - if (!result.success) { - return c.json(resError(result.error), 401); + // Try container-scoped JWT first (scope: 'container', 8h expiry + alarm refresh) + let payload = tryContainerJWTAuth(c, token, secret); + + // Fall back to legacy JWT verification + if (!payload) { + const result = verifyAgentJWT(token, secret); + if (!result.success) { + return c.json(resError(result.error), 401); + } + payload = result.payload; } - // Verify the rigId in the JWT matches the route param + // Cross-rig guard: only enforced for legacy agent JWTs where the rigId + // is cryptographically bound to the token. Container JWTs are town-scoped + // and don't carry a rigId — the container is trusted within its town. 
const rigId = c.req.param('rigId'); - if (rigId && result.payload.rigId !== rigId) { + if (rigId && payload.rigId && payload.rigId !== rigId) { return c.json(resError('Token rigId does not match route'), 403); } - // Verify the townId in the JWT matches the route param (cross-town guard) + // Verify the townId matches the route param (cross-town guard) const townId = c.req.param('townId'); - if (townId && townId !== result.payload.townId) { + if (townId && townId !== payload.townId) { return c.json(resError('Cross-town access denied'), 403); } - c.set('agentJWT', result.payload); + c.set('agentJWT', payload); return next(); }); /** - * Restricts a route to the specific agent identified by the JWT. - * Validates the agentId route param matches the JWT agentId. + * Restricts a route to the specific agent identified by the auth token. + * Validates the agentId route param matches the token's agentId. * Must be applied after `authMiddleware`. + * + * For container JWTs: agentId is populated from the route param by + * tryContainerJWTAuth, so this check is a no-op (route param == route + * param). This is intentional — the container JWT is town-scoped, and + * the container is trusted to call the correct agent endpoints. + * Cross-agent attacks require compromising the container itself, which + * is the same trust boundary the container already has (it runs all + * agents in the town). 
 */ export const agentOnlyMiddleware = createMiddleware(async (c, next) => { const jwt = c.get('agentJWT'); @@ -85,7 +132,7 @@ } const agentId = c.req.param('agentId'); - if (agentId && jwt.agentId !== agentId) { + if (agentId && jwt.agentId && jwt.agentId !== agentId) { return c.json(resError('Token agentId does not match route'), 403); } diff --git a/cloudflare-gastown/src/middleware/mayor-auth.middleware.ts b/cloudflare-gastown/src/middleware/mayor-auth.middleware.ts index ed127c09b2..d104e3050d 100644 --- a/cloudflare-gastown/src/middleware/mayor-auth.middleware.ts +++ b/cloudflare-gastown/src/middleware/mayor-auth.middleware.ts @@ -1,16 +1,18 @@ import { createMiddleware } from 'hono/factory'; -import { verifyAgentJWT } from '../util/jwt.util'; +import { verifyAgentJWT, verifyContainerJWT } from '../util/jwt.util'; import { resError } from '../util/res.util'; import type { GastownEnv } from '../gastown.worker'; import { extractBearerToken } from '@kilocode/worker-utils'; import { resolveSecret } from '../util/secret.util'; /** - * Auth middleware for mayor tool routes. Validates a Gastown agent JWT - * and checks that the JWT's `townId` matches the `:townId` route param. + * Auth middleware for mayor tool routes. Accepts either: + * 1. A container-scoped JWT (HS256, 8h expiry, scope: 'container') — preferred + * 2. A legacy agent JWT (HS256, 8h expiry) — backwards compatibility * - * Unlike the rig-scoped `authMiddleware` (which checks `rigId` match), - * this validates `townId` — the mayor operates cross-rig. + * Validates the token's `townId` matches the `:townId` route param. + * Unlike the rig-scoped `authMiddleware`, this does NOT check `rigId` + * because the mayor operates cross-rig. * * Sets `agentJWT` on the Hono context. 
 */ @@ -26,6 +28,23 @@ export const mayorAuthMiddleware = createMiddleware(async (c, next) return c.json(resError('Internal server error'), 500); } + // Try container-scoped JWT first (scope: 'container', carries townId + userId) + const containerResult = verifyContainerJWT(token, secret); + if (containerResult.success) { + const townId = c.req.param('townId'); + if (townId && containerResult.payload.townId !== townId) { + return c.json(resError('Token townId does not match route'), 403); + } + c.set('agentJWT', { + agentId: '', + rigId: '', + townId: containerResult.payload.townId, + userId: containerResult.payload.userId, + }); + return next(); + } + + // Fall back to legacy JWT verification const result = verifyAgentJWT(token, secret); if (!result.success) { return c.json(resError(result.error), 401); } diff --git a/cloudflare-gastown/src/types.ts b/cloudflare-gastown/src/types.ts index a775283462..5cd6c6d6fb 100644 --- a/cloudflare-gastown/src/types.ts +++ b/cloudflare-gastown/src/types.ts @@ -48,7 +48,7 @@ export type BeadFilter = { // -- Agents (now beads + agent_metadata) -- -export const AgentRole = z.enum(['polecat', 'refinery', 'mayor']); +export const AgentRole = z.enum(['polecat', 'refinery', 'mayor', 'triage']); export type AgentRole = z.infer; export const AgentStatus = z.enum(['idle', 'working', 'stalled', 'dead']); diff --git a/cloudflare-gastown/src/util/jwt.util.ts b/cloudflare-gastown/src/util/jwt.util.ts index b7f5970a5e..2a9110786e 100644 --- a/cloudflare-gastown/src/util/jwt.util.ts +++ b/cloudflare-gastown/src/util/jwt.util.ts @@ -1,6 +1,8 @@ import jwt from 'jsonwebtoken'; import { z } from 'zod'; +// ── Legacy per-agent JWT (deprecated — retained for rollout compat) ───── + export const AgentJWTPayload = z.object({ agentId: z.string(), rigId: z.string(), @@ -42,3 +44,58 @@ export function signAgentJWT( expiresIn: expiresInSeconds, }); } + +// ── Per-container JWT (preferred — 8h expiry, one per container) ───────── + +export const 
ContainerJWTPayload = z.object({ + townId: z.string(), + userId: z.string(), + scope: z.literal('container'), +}); + +export type ContainerJWTPayload = z.infer; + +const CONTAINER_JWT_EXPIRY_SECONDS = 8 * 3600; // 8h — same as legacy agent JWTs + +/** + * Sign a container-scoped JWT. 8h expiry, periodically refreshed by + * the TownDO alarm. Short-lived to limit damage from exfiltration, + * but refreshed proactively so running containers never hit expiry. + */ +export function signContainerJWT( + payload: { townId: string; userId: string }, + secret: string +): string { + return jwt.sign({ ...payload, scope: 'container' }, secret, { + algorithm: 'HS256', + expiresIn: CONTAINER_JWT_EXPIRY_SECONDS, + }); +} + +/** + * Verify a container-scoped JWT. Uses the standard 8h maxAge. + */ +export function verifyContainerJWT( + token: string, + secret: string +): { success: true; payload: ContainerJWTPayload } | { success: false; error: string } { + try { + const raw = jwt.verify(token, secret, { + algorithms: ['HS256'], + maxAge: '8h', + }); + const parsed = ContainerJWTPayload.safeParse(raw); + if (!parsed.success) { + return { success: false, error: 'Invalid container token payload' }; + } + return { success: true, payload: parsed.data }; + } catch (error) { + if (error instanceof jwt.TokenExpiredError) { + return { success: false, error: 'Token expired' }; + } + if (error instanceof jwt.JsonWebTokenError) { + return { success: false, error: 'Invalid token signature' }; + } + return { success: false, error: 'Token validation failed' }; + } +} diff --git a/plans/gastown-org-level-architecture.md b/plans/gastown-org-level-architecture.md deleted file mode 100644 index 2ed924d681..0000000000 --- a/plans/gastown-org-level-architecture.md +++ /dev/null @@ -1,413 +0,0 @@ -# Gastown at the Organization Level - -## Overview - -Gastown towns are currently user-scoped — one `GastownUserDO` per user, keyed by `userId`, storing that user's towns and rigs. 
There is no organization awareness anywhere in the gastown worker, DOs, container, or tool plugin. - -The Kilo platform already has a mature org model: org membership with roles (`owner`, `member`, `billing_manager`), shared GitHub/GitLab integrations, org-level billing with per-user daily limits, seat subscriptions, SSO, audit logs, and the mutually-exclusive ownership pattern (`owned_by_user_id` XOR `owned_by_organization_id`) used across every resource type. - -This spec defines how Gastown adopts the org model — enabling teams to share towns, pool agent resources, and coordinate work across members while leveraging the existing org infrastructure. - ---- - -## Design Principles - -1. **Org towns are the default for teams.** When a user belongs to an org, the primary workflow is creating and working in org-owned towns. Personal towns still exist for individual use. -2. **Existing org infrastructure, not new infrastructure.** Billing, integrations, roles, SSO, audit logs — all use the existing org systems. Gastown doesn't reinvent any of this. -3. **Org members share everything in a town.** All members can see all towns, all rigs, all beads, all agent conversations. Visibility is town-wide. Fine-grained per-rig permissions are a future concern. -4. **The Mayor serves the team, not one user.** An org town's Mayor is a shared resource. Any member can chat with it. The Mayor maintains context about all members' conversations. -5. **Billing is org-level.** All LLM and container costs for org towns charge against the org balance. 
- ---- - -## Ownership Model - -### Town ownership follows the platform pattern - -Towns adopt the same mutually-exclusive ownership used by every other Kilo resource: - -| Town type | Owner | Who can access | Billing | -| ------------ | -------------------------- | ------------------------------- | -------------- | -| Personal | `owned_by_user_id` | Only the user | User's balance | -| Organization | `owned_by_organization_id` | All org members (based on role) | Org balance | - -A town is either personal or org-owned, never both. - -### Org role → town permissions - -| Org role | Can view towns | Can create towns | Can manage towns (delete, config) | Can chat with Mayor | Can view agents/beads | -| ----------------- | ------------------------------- | ---------------- | --------------------------------- | ------------------- | --------------------- | -| `owner` | Yes | Yes | Yes | Yes | Yes | -| `member` | Yes | Yes | No | Yes | Yes | -| `billing_manager` | No (not a user of the platform) | No | No | No | No | - -This mirrors how org roles map to other resources in the platform — owners manage, members use, billing managers handle money. - -### Town creation flow - -When creating a town, the UI checks the user's context: - -- **User has no org:** Town is personal. Same as today. -- **User has one org:** Default to org-owned. Option to create a personal town instead. -- **User has multiple orgs:** Org picker before town creation. Option for personal. - -The create-town API accepts an optional `organizationId`. When present, the backend verifies org membership before creating the town. - ---- - -## Architecture Changes - -### Replace GastownUserDO with owner-keyed lookup - -The current `GastownUserDO` is keyed by `userId` and stores that user's towns. This doesn't work for org-owned towns — multiple users need access to the same set of towns. 
- -**New approach:** Replace the per-user DO with an **owner-keyed DO** that can be keyed by either `userId` or `orgId`: - -```typescript -function getGastownOwnerStub(env: Env, owner: { type: 'user' | 'org'; id: string }) { - const key = `${owner.type}:${owner.id}`; - return env.GASTOWN_OWNER.get(env.GASTOWN_OWNER.idFromName(key)); -} -``` - -- Personal towns: `getGastownOwnerStub(env, { type: 'user', id: userId })` -- Org towns: `getGastownOwnerStub(env, { type: 'org', id: orgId })` - -The `owner_towns` table adds an `owner_type` and `owner_id` column: - -```sql -CREATE TABLE owner_towns ( - town_id TEXT PRIMARY KEY, - name TEXT NOT NULL, - owner_type TEXT NOT NULL, -- 'user' or 'org' - owner_id TEXT NOT NULL, -- userId or orgId - created_by TEXT NOT NULL, -- userId of the creator (for audit) - created_at TEXT NOT NULL, - updated_at TEXT NOT NULL -); -``` - -### TownDO stores ownership context - -The TownDO config gains org awareness: - -```typescript -type TownConfig = { - owner_type: 'user' | 'org'; - owner_id: string; // userId or orgId - owner_user_id?: string; // set when owner_type = 'user' - organization_id?: string; // set when owner_type = 'org' - // ... existing config fields -}; -``` - -This propagates through: - -- **Container dispatch:** The container receives `organizationId` so it can resolve org-level integrations (GitHub tokens) and set appropriate env vars. -- **JWT minting:** The agent JWT payload gains `organizationId?: string` so rig-scoped tool calls carry org context. -- **Billing:** When the container makes LLM calls via the Kilo gateway, the `kilocodeToken` is minted with org context so costs charge against the org balance. 
- -### Route structure - -Add org-scoped routes alongside user-scoped routes: - -``` -# Personal towns (existing pattern, updated) -GET /api/users/:userId/towns -POST /api/users/:userId/towns - -# Org towns (new) -GET /api/orgs/:orgId/towns -POST /api/orgs/:orgId/towns - -# Town-level routes are the same regardless of ownership -# (townId is globally unique, no need for user/org prefix) -GET /api/towns/:townId/... -POST /api/towns/:townId/... -``` - -The town-level routes don't change — once you have a `townId`, the TownDO handles everything. The ownership context is already stored in the TownDO's config. - -### Auth middleware - -The gastown worker currently relies on CF Access as its only perimeter. For org support, add proper authorization: - -```typescript -// For /api/orgs/:orgId/towns/* routes -async function orgMiddleware(c: Context, next: Next) { - const orgId = c.req.param('orgId'); - const userId = getUserIdFromRequest(c); // from CF Access JWT or session - - // Verify org membership via the main Kilo API - const membership = await verifyOrgMembership(c.env, orgId, userId); - if (!membership) return c.json({ error: 'Not an org member' }, 403); - - c.set('orgId', orgId); - c.set('orgRole', membership.role); - c.set('userId', userId); - await next(); -} - -// For /api/towns/:townId/* routes -async function townAuthMiddleware(c: Context, next: Next) { - const townId = c.req.param('townId'); - const userId = getUserIdFromRequest(c); - - // Look up town ownership from TownDO config - const townDO = getTownDOStub(c.env, townId); - const config = await townDO.getConfig(); - - if (config.owner_type === 'user') { - if (config.owner_id !== userId) return c.json({ error: 'Forbidden' }, 403); - } else { - // Org-owned: verify caller is an org member - const membership = await verifyOrgMembership(c.env, config.organization_id!, userId); - if (!membership) return c.json({ error: 'Not an org member' }, 403); - } - - await next(); -} -``` - ---- - -## Shared Mayor - -In 
an org town, the Mayor is a shared resource. Multiple team members can chat with it concurrently or sequentially. - -### How it works - -The Mayor maintains a single persistent session per town (same as today). When any org member sends a message, it goes to the same Mayor session. The Mayor's conversation history includes messages from all members. - -Each message carries the sender's identity: - -```typescript -// When forwarding a user message to the Mayor's session -const systemContext = `[Message from ${userName} (${userRole})]`; -``` - -The Mayor can see who's talking to it and tailor responses accordingly. "Sarah asked me to refactor the auth module yesterday. You're asking about the auth module too — are you coordinating with her, or is this separate work?" - -### Mayor chat in the dashboard - -The town dashboard's Mayor chat panel shows the conversation to all connected members. Messages are attributed to their senders. This is a shared chat room where the Mayor is the AI participant and team members are the human participants. - -Implementation: The existing Mayor WebSocket stream (town-wide, multiplexed) already supports multiple connected clients. Each client sends messages with the user's identity. The Mayor's responses are broadcast to all connected clients. - -### Concurrency - -When two members send messages simultaneously, they're queued by the TownDO (DO RPC serialization guarantees single-writer). The Mayor processes them sequentially. The second message includes context from the first — the Mayor sees the full conversation, not isolated threads. - -If the team wants isolated conversations with the Mayor (e.g., a private question about performance), that's a future feature (per-user Mayor threads within an org town). For now, all Mayor interaction is shared. 
- ---- - -## Integrations - -### Org GitHub/GitLab apps are used automatically - -When creating a rig in an org-owned town, the repo picker shows repositories from the **org's GitHub/GitLab installations** (not the user's personal installations). This uses the existing `getIntegrationForOwner({ type: 'org', id: orgId }, 'github')` infrastructure. - -The flow: - -1. User clicks "Add Rig" in an org town -2. Backend calls `getIntegrationForOwner({ type: 'org', id: orgId }, 'github')` -3. Repo picker shows org-accessible repos -4. On rig creation, `platform_integration_id` on the rig references the org's integration -5. When the container needs a git token, it's minted from the org's GitHub App installation - -If the org doesn't have a GitHub App installed, the "Add Rig" flow prompts the user to install it (requires org `owner` role). - ---- - -## Billing - -### Org towns charge the org - -All LLM costs for agents in org-owned towns charge against the org balance. This uses the existing `getBalanceForOrganizationUser(orgId, userId)` infrastructure: - -1. When the TownDO dispatches an agent, it mints a `kilocodeToken` scoped to the org -2. The container's kilo serve instances route LLM calls through the Kilo gateway with this token -3. The gateway charges usage to the org's `microdollars_used` - -### Container costs - -Cloudflare Container costs are per-town. For org towns, these costs are attributed to the org. Metering uses the existing `microdollar_usage` table with `organization_id` set. 
- ---- - -## Cross-Member Visibility - -### Dashboard shows everything - -When any org member opens an org town's dashboard, they see the complete picture: - -- All rigs, all beads, all agents, all convoys -- All members' Mayor chat history -- All agent conversation streams -- All merge queue entries and their outcomes -- Activity feed across all members' actions - -Attribution is clear — every bead shows who created it, every convoy shows who initiated it, every Mayor message shows who sent it. The dashboard answers "what is happening across the entire team's agent fleet?" - -### Notifications - -When an event occurs in an org town (convoy lands, escalation raised, merge failed), all connected dashboard clients receive the event via the existing WebSocket stream. Targeted notifications (e.g., "your convoy landed") use the `created_by` field on beads to identify the relevant member. - -Future: Slack integration for org towns. Gastown events post to an org's Slack channel via the existing `organization-slack-router` infrastructure. "Convoy cv-abc landed: 5/5 beads merged across 2 rigs. Total cost: $23.40." 
- ---- - -## Audit Trail - -### Org audit logs include Gastown events - -The existing `organization_audit_logs` table gains new action types for Gastown events: - -| Action | Details | -| ----------------------------- | -------------------------------- | -| `gastown.town.create` | Member created a town | -| `gastown.town.delete` | Owner deleted a town | -| `gastown.town.config_change` | Owner changed town config | -| `gastown.rig.create` | Member added a rig | -| `gastown.rig.delete` | Owner removed a rig | -| `gastown.convoy.create` | Member/Mayor initiated a convoy | -| `gastown.convoy.landed` | Convoy completed | -| `gastown.escalation.critical` | Critical escalation raised | -| `gastown.escalation.resolved` | Escalation acknowledged/resolved | - -These are written by the gastown worker when handling org-town events, via a service binding to the main Kilo API (or direct Postgres write if the gastown worker has DB access). - ---- - -## Org-Level Fleet View - -### The "all towns" dashboard - -Beyond individual town dashboards, org owners get an aggregate view across all their org's towns: - -**`/gastown/org/[orgId]`** shows: - -- **Town cards** — one per town, showing: name, active agent count, open bead count, today's spend, latest activity -- **Aggregate metrics** — total spend (today/this week/this month), total beads closed, total convoys landed, active agent count across all towns -- **Cost breakdown** — per-town, per-rig, per-model cost attribution -- **Performance comparison** — which towns/rigs have high first-pass merge rates, which have high rework rates -- **Active escalations** — all unacknowledged escalations across all towns, surfaced at the top - -This view is read-only for members and actionable for owners (click into any town, adjust config, kill runaway agents). - -### Cross-town convoys - -A convoy can track beads across multiple towns. 
This is natural because convoys are beads in the TownDO — but cross-town convoys require a coordination layer: - -1. The initiating town creates a convoy bead -2. For beads in other towns, the convoy uses `bead_dependencies` with HOP-style references: `{ depends_on: "town:other-town-id:bead-id", type: "tracks" }` -3. When a tracked bead in another town closes, that town's alarm notifies the initiating town (via a cross-town webhook or direct DO RPC if both towns are in the same org's gastown worker) -4. The initiating town updates convoy progress - -This extends the local Gastown convoy model to multi-town scope, which local Gastown doesn't support (convoys are per-town, tracking beads across rigs within one town). - ---- - -## Agent Identity at the Org Level - -### Agents are town-scoped, but CVs aggregate at the org level - -Within a town, agent identities are town-scoped (per #441). But across towns in the same org, agent performance data can be aggregated: - -- "Polecats using Claude Opus across all our towns have a 91% first-pass merge rate" -- "The payments-town has 3x the rework rate of the platform-town — something is wrong with the repo or the prompts" -- "Agent Toast in frontend-town has completed 47 beads with $0.83 average cost" - -This data lives in the TownDO (per-town agent beads and bead events). The org fleet view aggregates across TownDOs via the gastown worker. - -### Shared agent configurations - -Org owners can define agent configurations at the org level: - -```typescript -type OrgAgentConfig = { - default_model: string; - polecat_system_prompt_override?: string; - refinery_quality_gates?: string[]; - max_polecats_per_rig?: number; -}; -``` - -These serve as defaults for all towns in the org. Individual towns can override. This prevents the "every town is configured differently" problem and lets the org standardize on configurations that produce good results. 
- ---- - -## SSO and Auto-Provisioning - -When an org has SSO configured (via WorkOS), new team members who authenticate via SSO are auto-provisioned into the org. They immediately see all org-owned Gastown towns in their dashboard — no manual invitation or town sharing needed. - -The flow: - -1. New engineer joins company, authenticates via company SSO -2. WorkOS auto-provisions them into the Kilo org (existing behavior) -3. They navigate to Gastown, see all org towns -4. They open a town, chat with the Mayor, watch agents work - -Zero configuration for the new member. The org's Gastown infrastructure is immediately accessible. - ---- - -## Implementation Phases - -### Phase 1: Ownership and access control - -- Replace `GastownUserDO` with owner-keyed `GastownOwnerDO` -- Add `owner_type`/`owner_id` to town tables and TownDO config -- Add `organizationId` to agent JWT payload -- Add org auth middleware to gastown worker routes -- Add org-scoped routes (`/api/orgs/:orgId/towns`) -- Wire org membership verification - -### Phase 2: Billing integration - -- Mint org-scoped `kilocodeToken` for org town agents -- Route LLM costs to org balance via existing infrastructure -- Container cost attribution to org via `microdollar_usage` table - -### Phase 3: Shared Mayor and dashboard - -- Multi-user Mayor chat (message attribution, shared conversation) -- Dashboard access for all org members -- Activity feed shows member attribution - -### Phase 4: Org fleet view - -- Aggregate dashboard across all org towns -- Cost breakdown per town/rig/model -- Performance comparison metrics -- Cross-town escalation surfacing - -### Phase 5: Org-level configuration - -- Org-level agent config defaults (model, prompts, quality gates) -- Town-level overrides -- Shared formula library per org - -### Phase 6: Cross-town convoys - -- Cross-town bead references -- Cross-town convoy tracking and landing detection -- Cross-town notification routing - -### Phase 7: Audit and compliance - -- Gastown 
event types in org audit logs -- Org-level usage reporting -- Export capabilities for compliance - ---- - -## What This Enables (That Local Gastown Can't Do) - -1. **Team coordination** — Multiple engineers share a Mayor that knows what everyone is working on. "Don't touch the auth module, Sarah's convoy is refactoring it" happens naturally. -2. **Centralized cost visibility** — One dashboard showing total Gastown spend across all teams. -3. **Zero-config onboarding** — New engineer authenticates via SSO, immediately sees all org towns and can start using them. -4. **Org-wide performance data** — "Which model works best for our TypeScript repos?" answered from real production data across all teams. -5. **Cross-town project tracking** — A convoy that spans the frontend town, backend town, and infra town, with unified progress tracking and landing detection. -6. **Shared institutional knowledge** — Agent formulas, quality gate configs, and prompt tuning that work well for the org are shared across all towns, not siloed per developer.