From 2099c192bb34606b50e4e5286e71c3bb83d4c095 Mon Sep 17 00:00:00 2001
From: zbigniew sobiecki
Date: Sat, 25 Apr 2026 13:26:24 +0200
Subject: [PATCH] fix(router): scope orphan-cleanup to this instance's own
 workers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When two cascade-router instances share a host (prod ↔ dev, multi-replica
deployments, two local-dev sandboxes), each one's periodic orphan-cleanup
scan would silently `docker stop` the OTHER instance's healthy in-flight
workers at the 30-min `workerTimeoutMs` mark — surfacing downstream as
`exit 137 · OOMKilled=false` agent runs that everyone blamed on memory.

## The bug

`scanAndCleanupOrphans` filtered Docker by a single label
(`cascade.managed=true`) and checked the resulting containers against THIS
instance's in-process `activeWorkers` map. Sibling instances each have their
own independent map. So:

    cascade-router      spawns + tracks worker container 9fc5fb3b…
    cascade-router-dev  scans, sees 9fc5fb3b with `cascade.managed=true`,
                        NOT in dev's empty activeWorkers, age > 30 min,
                        → calls `container.stop({ t: 15 })`
                        → SIGTERM, 15 s grace, then Docker SIGKILL
                        → exit 137 with `OOMKilled=false`

Symmetric in the other direction. The 5-min scan interval plus the 30-min
age threshold is what produced the highly consistent 30–34 min runtime
across every recent ucho exit-137 run.

The smoking gun was visible only in cascade-router-DEV's log:

    2026-04-25 11:01:43 WARN [cascade-router-dev] [WorkerManager] Stopped and removed orphaned container: { containerId: '9fc5fb3b7340', ... }

…paired with the prod cascade-router log showing zero kill events for that
container, because the prod instance's tracking was fine.
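
A minimal sketch of the pre-fix scan path (illustrative only, not the literal
removed code; `activeWorkers` and `workerTimeoutMs` are the real names, the
stand-in declarations and the map key are hypothetical):

    import Docker from 'dockerode';

    const docker = new Docker();
    const activeWorkers = new Map<string, unknown>(); // keyed by container id in this sketch only
    const workerTimeoutMs = 30 * 60 * 1000;           // 30 min

    // Pre-fix: the filter is instance-agnostic, so EVERY instance's workers match,
    // and the only ownership check is against THIS process's in-memory map.
    const candidates = await docker.listContainers({
      all: false,
      filters: { label: ['cascade.managed=true'] },
    });
    for (const c of candidates) {
      const ageMs = Date.now() - c.Created * 1000;
      if (!activeWorkers.has(c.Id) && ageMs > workerTimeoutMs) {
        // A sibling router's healthy worker is indistinguishable from an orphan here.
        await docker.getContainer(c.Id).stop({ t: 15 }); // SIGTERM, 15 s grace, SIGKILL → exit 137
      }
    }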

## The fix

Tag every spawned container with the spawning router's instance id
(`cascade.router.instance` Docker label). Scope the periodic scan's
`docker.listContainers({ filters })` call to ONLY containers carrying THIS
instance's id. The dev instance now never even *sees* prod's containers in
its scan — the filter is server-side, not client-side, so there's no risk of
a future bug in client-side checks leaking through.
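
A sketch of the resulting call shape (the authoritative version is in
`orphan-cleanup.ts` in the diff below); Docker treats multiple entries in a
`label` filter as AND, and the matching happens in the daemon rather than in
this process:

    const containers = await docker.listContainers({
      all: false, // running containers only
      filters: {
        label: [
          'cascade.managed=true',                          // still marks every cascade worker
          `cascade.router.instance=${ROUTER_INSTANCE_ID}`, // restricts this scan to our own containers
        ],
      },
    });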

Instance id resolution (`src/router/instance-id.ts`):

1. `process.env.CASCADE_ROUTER_INSTANCE` (trimmed) — explicit override for
   hosts that share a hostname.
2. `os.hostname()` — Docker injects the container's short id here by default,
   so each container of cascade-router gets a unique value automatically.
   This is the normal path.

Throws fail-loud if both resolve empty (defensive).

## Files

- `src/router/instance-id.ts` (new) — pure resolver + module-load-memoised
  `ROUTER_INSTANCE_ID` constant.
- `src/router/container-manager.ts` — adds the new label at
  `docker.createContainer({ Labels: ... })`.
- `src/router/orphan-cleanup.ts` — adds the second clause to the
  `docker.listContainers({ filters: { label: [...] } })` call; includes
  `instanceId` in the kill-confirmation warn log so post-mortems
  unambiguously identify which instance acted.
- `src/router/index.ts` — logs `instanceId` at router startup.

## Tests (TDD-first)

- `tests/unit/router/instance-id.test.ts` (new, 7 cases) — env override wins,
  fallback to hostname, env-trimming, defensive throws on empty/whitespace-only
  inputs.
- `tests/unit/router/orphan-cleanup.test.ts` — extended the `lists containers
  with cascade.managed=true label` test to assert the new instance-scoping
  label is in the filter; new test pins the safety property explicitly with a
  comment explaining the historical context.
- `tests/unit/router/container-manager.test.ts` — new case asserts spawn
  passes `cascade.router.instance` in the Labels block.

All 1438/1438 unit-api tests pass. `npm run typecheck` clean.
`npx biome check .` clean (1228 files).

## Operator note for deploy

Pre-fix containers still running at deploy time carry no
`cascade.router.instance` label. Neither router will see them in its scan
filter post-deploy — they become invisible orphans (harmless: they exit
naturally; only snapshot-enabled ones without AutoRemove leak). One-shot
manual sweep if needed:

    docker ps --filter label=cascade.managed=true --format \
      '{{.ID}} {{.RunningFor}}' \
      | awk '/(hour|day)/ {print $1}' \
      | xargs -r docker stop -t 15

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 src/router/container-manager.ts             |  6 +++
 src/router/index.ts                         |  3 +-
 src/router/instance-id.ts                   | 43 +++++++++++++++++++
 src/router/orphan-cleanup.ts                | 25 +++++++++--
 tests/unit/router/container-manager.test.ts | 23 +++++++++++
 tests/unit/router/instance-id.test.ts       | 46 +++++++++++++++++++++
 tests/unit/router/orphan-cleanup.test.ts    | 25 +++++++++++
 7 files changed, 166 insertions(+), 5 deletions(-)
 create mode 100644 src/router/instance-id.ts
 create mode 100644 tests/unit/router/instance-id.test.ts

diff --git a/src/router/container-manager.ts b/src/router/container-manager.ts
index 7ccd2636..c8be047b 100644
--- a/src/router/container-manager.ts
+++ b/src/router/container-manager.ts
@@ -19,6 +19,7 @@ import { logger } from '../utils/logging.js';
 import { activeWorkers, cleanupWorker } from './active-workers.js';
 import { clearAllAgentTypeLocks } from './agent-type-lock.js';
 import { loadProjectConfig, routerConfig } from './config.js';
+import { ROUTER_INSTANCE_ID } from './instance-id.js';
 import { notifyTimeout } from './notifications.js';
 import { stopOrphanCleanup } from './orphan-cleanup.js';
 import type { CascadeJob } from './queue.js';
@@ -386,6 +387,11 @@ async function createAndMonitorContainer(
       'cascade.job.id': jobId,
       'cascade.job.type': job.data.type,
       'cascade.managed': 'true',
+      // Pinning the spawning router's instance id stops sibling
+      // cascade-router instances on the same host from claiming
+      // each other's containers as orphans — see `instance-id.ts`
+      // and `orphan-cleanup.ts:scanAndCleanupOrphans`.
+      'cascade.router.instance': ROUTER_INSTANCE_ID,
       'cascade.project.id': projectId ?? '',
       'cascade.agent.type': agentType ?? '',
       'cascade.snapshot.enabled': snapshotEnabled ? 'true' : 'false',
diff --git a/src/router/index.ts b/src/router/index.ts
index edf30bf8..4f05f05e 100644
--- a/src/router/index.ts
+++ b/src/router/index.ts
@@ -29,6 +29,7 @@ import { LinearRouterAdapter } from './adapters/linear.js';
 import { SentryRouterAdapter } from './adapters/sentry.js';
 import { TrelloRouterAdapter } from './adapters/trello.js';
 import { startCancelListener, stopCancelListener } from './cancel-listener.js';
+import { ROUTER_INSTANCE_ID } from './instance-id.js';
 import { getQueueStats } from './queue.js';
 import { processRouterWebhook } from './webhook-processor.js';
 import {
@@ -242,7 +243,7 @@ async function startRouter(): Promise<void> {
   await startCancelListener();

   startWorkerProcessor();

-  logger.info('Starting router', { port });
+  logger.info('Starting router', { port, instanceId: ROUTER_INSTANCE_ID });
   serve({ fetch: app.fetch, port });
 }
diff --git a/src/router/instance-id.ts b/src/router/instance-id.ts
new file mode 100644
index 00000000..02bc69d2
--- /dev/null
+++ b/src/router/instance-id.ts
@@ -0,0 +1,43 @@
+/**
+ * Stable identifier for this cascade-router process. Used to:
+ *
+ * - **Tag spawned worker containers** with a `cascade.router.instance`
+ *   Docker label (see `container-manager.ts`).
+ * - **Scope the periodic orphan-cleanup scan** to containers carrying
+ *   THIS instance's id (see `orphan-cleanup.ts`).
+ *
+ * Without this, two cascade-router instances on the same host (prod +
+ * dev, two local-dev sandboxes, k8s replicas pre-this-change) would
+ * each treat the other's worker containers as orphans and silently
+ * `docker stop` them at the 30-min `workerTimeoutMs` mark — surfacing
+ * downstream as `exit 137 · OOMKilled=false` agent runs that everyone
+ * blamed on memory.
+ *
+ * Resolution order:
+ *   1. `process.env.CASCADE_ROUTER_INSTANCE` (trimmed) — explicit
+ *      override for the rare case where two instances share a hostname
+ *      (e.g. local docker with `--network host`).
+ *   2. `os.hostname()` — Docker injects the container's short id here
+ *      by default, which is per-container unique and stable across the
+ *      cascade-router process's lifetime. This is the normal path.
+ *
+ * Memoised at module load. The pure resolver is exported for direct
+ * unit testing.
+ */
+import os from 'node:os';
+
+export function resolveRouterInstanceId(
+  env: NodeJS.ProcessEnv | Record<string, string | undefined>,
+  hostname: string,
+): string {
+  const fromEnv = env.CASCADE_ROUTER_INSTANCE?.trim();
+  if (fromEnv) return fromEnv;
+  const fromHost = hostname.trim();
+  if (fromHost) return fromHost;
+  throw new Error(
+    'Cannot resolve router instance id: both CASCADE_ROUTER_INSTANCE and os.hostname() are empty. ' +
+      'Set CASCADE_ROUTER_INSTANCE explicitly to disambiguate this cascade-router instance.',
+  );
+}
+
+export const ROUTER_INSTANCE_ID = resolveRouterInstanceId(process.env, os.hostname());
diff --git a/src/router/orphan-cleanup.ts b/src/router/orphan-cleanup.ts
index 98a22bd8..5ce44524 100644
--- a/src/router/orphan-cleanup.ts
+++ b/src/router/orphan-cleanup.ts
@@ -1,8 +1,15 @@
 /**
  * Orphaned container cleanup for CASCADE worker processes.
  *
- * Self-contained periodic task that scans for containers with cascade.managed=true
- * that are not tracked in the activeWorkers map and are older than workerTimeoutMs.
+ * Self-contained periodic task that scans for containers with
+ * `cascade.managed=true` AND `cascade.router.instance=<this instance's id>`
+ * that are not tracked in the activeWorkers map and are older than
+ * workerTimeoutMs.
+ *
+ * The `cascade.router.instance` clause is the safety belt that stops
+ * sibling cascade-router instances on the same host (prod ↔ dev,
+ * multi-replica deployments) from killing each other's healthy
+ * workers — see `instance-id.ts` for the resolver.
  */

 import Docker from 'dockerode';
@@ -11,6 +18,7 @@ import { captureException } from '../sentry.js';
 import { logger } from '../utils/logging.js';
 import { getTrackedContainerIds } from './active-workers.js';
 import { routerConfig } from './config.js';
+import { ROUTER_INSTANCE_ID } from './instance-id.js';

 const docker = new Docker();

@@ -44,7 +52,9 @@ export function startOrphanCleanup(): void {
     });
   }, ORPHAN_SCAN_INTERVAL_MS);

-  logger.info('[WorkerManager] Started orphan cleanup scan (every 5 minutes)');
+  logger.info('[WorkerManager] Started orphan cleanup scan (every 5 minutes)', {
+    instanceId: ROUTER_INSTANCE_ID,
+  });
 }

 /**
@@ -72,7 +82,13 @@ export async function scanAndCleanupOrphans(): Promise<void> {
   const containers = await docker.listContainers({
     all: false, // Only running containers
     filters: {
-      label: ['cascade.managed=true'],
+      // Two-clause filter: `cascade.managed=true` keeps the cross-
+      // instance debugging affordance (`docker ps -a --filter
+      // label=cascade.managed=true` finds every cascade worker
+      // regardless of who spawned it), and the per-instance label
+      // is the safety belt that stops sibling routers from
+      // claiming each other's containers as orphans.
+      label: ['cascade.managed=true', `cascade.router.instance=${ROUTER_INSTANCE_ID}`],
     },
   });

@@ -115,6 +131,7 @@ export async function scanAndCleanupOrphans(): Promise<void> {
     logger.warn('[WorkerManager] Stopped and removed orphaned container:', {
       containerId: containerId.slice(0, 12),
       ageMinutes,
+      instanceId: ROUTER_INSTANCE_ID,
     });

     // Update DB run status (fire-and-forget). Containers created before this
diff --git a/tests/unit/router/container-manager.test.ts b/tests/unit/router/container-manager.test.ts
index 2ced742b..79840f64 100644
--- a/tests/unit/router/container-manager.test.ts
+++ b/tests/unit/router/container-manager.test.ts
@@ -339,6 +339,29 @@ describe('spawnWorker', () => {
     resolveWait();
   });

+  it('includes cascade.router.instance label so sibling routers do not orphan-kill this container', async () => {
+    const { resolveWait } = setupMockContainer();
+
+    await spawnWorker(
+      makeJob({
+        id: 'job-instance',
+        data: { type: 'trello', projectId: 'proj-1' } as CascadeJob,
+      }) as never,
+    );
+
+    const createCall = mockDockerCreateContainer.mock.calls[0]?.[0] as {
+      Labels: Record<string, string>;
+    };
+    const instanceLabel = createCall.Labels['cascade.router.instance'];
+    // The exact value depends on os.hostname() / the env override; pin
+    // "non-empty string" — it's the mere presence + uniqueness that
+    // makes orphan-cleanup's per-instance filter work.
+    expect(instanceLabel).toBeTruthy();
+    expect(typeof instanceLabel).toBe('string');
+
+    resolveWait();
+  });
+
   it('uses project watchdogTimeoutMs + 2min buffer when available', async () => {
     mockLoadProjectConfig.mockResolvedValue({
       projects: [],
diff --git a/tests/unit/router/instance-id.test.ts b/tests/unit/router/instance-id.test.ts
new file mode 100644
index 00000000..d10feddd
--- /dev/null
+++ b/tests/unit/router/instance-id.test.ts
@@ -0,0 +1,46 @@
+import { describe, expect, it } from 'vitest';
+import { resolveRouterInstanceId } from '../../../src/router/instance-id.js';
+
+describe('resolveRouterInstanceId', () => {
+  it('returns process.env.CASCADE_ROUTER_INSTANCE when set non-empty', () => {
+    expect(
+      resolveRouterInstanceId({ CASCADE_ROUTER_INSTANCE: 'cascade-router-prod' }, 'fallback-host'),
+    ).toBe('cascade-router-prod');
+  });
+
+  it('falls back to hostname when env is undefined', () => {
+    expect(resolveRouterInstanceId({}, 'bauer-12345')).toBe('bauer-12345');
+  });
+
+  it('falls back to hostname when env is empty string', () => {
+    expect(resolveRouterInstanceId({ CASCADE_ROUTER_INSTANCE: '' }, 'bauer-12345')).toBe(
+      'bauer-12345',
+    );
+  });
+
+  it('falls back to hostname when env is whitespace-only', () => {
+    expect(resolveRouterInstanceId({ CASCADE_ROUTER_INSTANCE: ' ' }, 'bauer-12345')).toBe(
+      'bauer-12345',
+    );
+  });
+
+  it('trims whitespace from the env value when honoring it', () => {
+    expect(
+      resolveRouterInstanceId(
+        { CASCADE_ROUTER_INSTANCE: ' cascade-router-staging ' },
+        'fallback-host',
+      ),
+    ).toBe('cascade-router-staging');
+  });
+
+  it('throws when both env and hostname are empty (defensive)', () => {
+    // Should never happen in practice — os.hostname() always returns
+    // something — but a fail-loud guard is cheap and prevents a silent
+    // mis-tagging if a future runtime ever returns '' from hostname().
+    expect(() => resolveRouterInstanceId({}, '')).toThrow(/Cannot resolve router instance id/i);
+  });
+
+  it('rejects whitespace-only hostname', () => {
+    expect(() => resolveRouterInstanceId({}, ' ')).toThrow(/Cannot resolve router instance id/i);
+  });
+});
diff --git a/tests/unit/router/orphan-cleanup.test.ts b/tests/unit/router/orphan-cleanup.test.ts
index e5b3d0bc..4ef3fa87 100644
--- a/tests/unit/router/orphan-cleanup.test.ts
+++ b/tests/unit/router/orphan-cleanup.test.ts
@@ -125,6 +125,31 @@ describe('orphan-cleanup', () => {
     );
   });

+  it('scopes the scan to THIS instance via cascade.router.instance label', async () => {
+    // Critical safety property: when two cascade-router instances run on
+    // the same host (prod + dev, k8s replicas, two local sandboxes),
+    // each must only see its OWN spawned containers. Without this the
+    // dev instance silently `docker stop`s the prod instance's workers
+    // at the 30-min `workerTimeoutMs` mark — surfacing downstream as
+    // `exit 137 · OOMKilled=false` runs that masquerade as OOM.
+    mockDockerListContainers.mockResolvedValue([]);
+
+    await scanAndCleanupOrphans();
+
+    const firstCallArg = mockDockerListContainers.mock.calls[0]?.[0] as {
+      filters: { label: string[] };
+    };
+    const labelFilters = firstCallArg.filters.label;
+    expect(labelFilters).toContain('cascade.managed=true');
+    // The instance label must be present and non-empty. Pinning the
+    // exact value would tie the test to whatever hostname the test
+    // environment happens to expose; pinning the prefix is what
+    // matters for the safety property.
+    const instanceLabel = labelFilters.find((l) => l.startsWith('cascade.router.instance='));
+    expect(instanceLabel).toBeDefined();
+    expect(instanceLabel?.split('=')[1]).toBeTruthy();
+  });
+
   it('skips tracked containers', async () => {
     const trackedContainerId = 'container-abc123def456';
     mockTrackedIds.add(trackedContainerId);