Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions AGENTS.md
2 changes: 2 additions & 0 deletions CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,8 @@ Some triggers take params (e.g. `review` + `scm:check-suite-success` accepts `{"

**Post-completion review dispatch** — when an implementation agent succeeds with a PR, the execution pipeline checks CI status and fires the review agent deterministically (before the container exits). This guarantees review dispatch within seconds of implementation completion, regardless of GitHub webhook timing. Uses the same `claimReviewDispatch` dedup key as the `check-suite-success` trigger, so the two paths cannot double-enqueue.

**Worker exit diagnostics** — when a worker container exits non-zero, the router calls `container.inspect()` *before* AutoRemove reaps it and stamps the run record's `error` field with a structured, grep-stable string: `Worker crashed with exit code N · OOMKilled=<true|false> · reason="<State.Error>"`. The `OOMKilled=true` marker is the definitive cgroup-OOM signal (per Docker's own `State.OOMKilled`); a 137 exit *without* `OOMKilled=true` means the kill came from inside the container or from a non-cgroup signal — *not* from the kernel's memory cgroup. The `[WorkerManager] Resolved spawn settings` log emitted at every spawn includes both `projectWatchdogTimeoutMs` and `globalWorkerTimeoutMs` so post-mortems can confirm whether the per-project override actually took effect. See `src/router/active-workers.ts:formatCrashReason` for the format and `tests/unit/router/container-manager-diagnostics.test.ts` for regression pins.

## Review agent — context shape (debugging)

Review agent receives a **compact per-file diff context**, not full file contents. Each changed file is a `### <file> (<status>, +N -M)` section with a unified diff hunk. Budget: `REVIEW_DIFF_CONTEXT_TOKEN_LIMIT` = 200k tokens, per-file cap 10%.
Expand Down
39 changes: 30 additions & 9 deletions src/router/active-workers.ts
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,32 @@ export interface ActiveWorker {
agentType?: string;
}

/**
 * Diagnostic facts about a worker exit, surfaced into the run record's `error`
 * field so post-mortem investigations can answer "was this OOM?" or "was it
 * killed by Docker?" without needing SSH + syslog access to the host. Sourced
 * from `dockerode container.inspect()` after `wait()` resolves.
 */
export interface ExitDetails {
  /** `State.OOMKilled` from Docker — the definitive cgroup-OOM signal. */
  oomKilled?: boolean;
  /** `State.Error` from Docker — non-empty when the runtime aborted the container. */
  exitReason?: string;
}

/**
 * Build the stable, grep-friendly crash-reason string stamped into the run
 * record's `error` field:
 * `Worker crashed with exit code N · OOMKilled=… · reason="…"`.
 * Segments whose underlying fact is absent (undefined oomKilled, empty
 * exitReason) are dropped entirely rather than rendered blank.
 */
export function formatCrashReason(exitCode: number, details?: ExitDetails): string {
  const segments = [`Worker crashed with exit code ${exitCode}`];
  if (typeof details?.oomKilled === 'boolean') {
    segments.push(`OOMKilled=${details.oomKilled}`);
  }
  const reason = details?.exitReason;
  if (reason) {
    segments.push(`reason="${reason}"`);
  }
  return segments.join(' · ');
}

export const activeWorkers = new Map<string, ActiveWorker>();

/**
Expand All @@ -50,7 +76,7 @@ export function getActiveWorkers(): Array<{ jobId: string; startedAt: Date }> {
* The timeout path (killWorker) handles its own 'timed_out' DB update and calls
* cleanupWorker without an exitCode so this block is skipped.
*/
export function cleanupWorker(jobId: string, exitCode?: number): void {
export function cleanupWorker(jobId: string, exitCode?: number, details?: ExitDetails): void {
const worker = activeWorkers.get(jobId);
if (worker) {
clearTimeout(worker.timeoutHandle);
Expand All @@ -62,20 +88,15 @@ export function cleanupWorker(jobId: string, exitCode?: number): void {
}
if (exitCode !== undefined && exitCode !== 0 && worker.projectId) {
const durationMs = Date.now() - worker.startedAt.getTime();
const reason = formatCrashReason(exitCode, details);
const updatePromise = worker.workItemId
? failOrphanedRun(
worker.projectId,
worker.workItemId,
`Worker crashed with exit code ${exitCode}`,
'failed',
durationMs,
)
? failOrphanedRun(worker.projectId, worker.workItemId, reason, 'failed', durationMs)
: failOrphanedRunFallback(
worker.projectId,
worker.agentType,
worker.startedAt,
'failed',
`Worker crashed with exit code ${exitCode}`,
reason,
durationMs,
);
updatePromise
Expand Down
189 changes: 142 additions & 47 deletions src/router/container-manager.ts
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,123 @@ async function removeContainer(containerId: string): Promise<void> {
}
}

/**
 * Inspect a just-exited container and pull the diagnostic facts that explain
 * its exit. `OOMKilled` and `State.Error` are only available before AutoRemove
 * (or our manual `removeContainer`) reaps the container, so this MUST run
 * before either path. Returns nullable fields when inspection fails — never
 * throws.
 *
 * @internal Exported for unit testing only — call `onWorkerExit` from app code.
 */
export async function inspectExitedContainer(
  container: ReturnType<typeof docker.getContainer>,
  jobId: string,
): Promise<{ oomKilled?: boolean; exitReason?: string; durationMs?: number }> {
  // Best-effort inspection: `inspect()` can fail post-exit if the daemon
  // socket drops, the container is reaped between `wait()` and here, or the
  // API times out. Undefined diagnostics beat failing the post-exit pipeline.
  let info: Awaited<ReturnType<typeof container.inspect>> | null = null;
  try {
    info = await container.inspect();
  } catch (err) {
    logger.warn('[WorkerManager] container.inspect() after wait failed:', {
      jobId,
      error: String(err),
    });
  }

  const state = info?.State;
  // An empty `State.Error` (the common case for clean exits) maps to
  // undefined so the run-record reason string omits the `reason="…"` segment.
  const exitReason = state?.Error ? state.Error : undefined;

  // Docker can report sentinel timestamps (e.g. `0001-01-01T00:00:00Z` for a
  // container that never fully started) that parse to NaN or yield negative
  // spans — drop those so downstream logs/Sentry never ship `durationMs: NaN`.
  let durationMs: number | undefined;
  if (state?.StartedAt && state?.FinishedAt) {
    const elapsed = Date.parse(state.FinishedAt) - Date.parse(state.StartedAt);
    if (Number.isFinite(elapsed) && elapsed >= 0) {
      durationMs = elapsed;
    }
  }

  return { oomKilled: state?.OOMKilled, exitReason, durationMs };
}

/**
 * Tail-log the worker's stdout/stderr for at-a-glance debugging. Full
 * per-worker logs are also indexed in Loki via promtail's per-container label
 * (`{container="/cascade-worker-${jobId}"}`); this 50-line tail is a
 * convenience and is not load-bearing.
 */
async function logWorkerTail(container: ReturnType<typeof docker.getContainer>): Promise<void> {
  try {
    const raw = await container.logs({ stdout: true, stderr: true, follow: false });
    const text = raw.toString('utf-8').trim();
    if (!text) return;
    const allLines = text.split('\n');
    const shown = Math.min(allLines.length, 50);
    const tail = allLines.slice(-50).join('\n');
    logger.info(
      `[WorkerManager] Worker logs (last ${shown} of ${allLines.length} lines):\n${tail}`,
    );
  } catch {
    // Container may already be removed — expected with AutoRemove
  }
}

/**
 * Post-exit handler for a worker container: pulls Docker diagnostics, tail-logs
 * stdout, decides snapshot commit vs. skip, and runs cleanup. Extracted from
 * the `wait()` callback to keep its cyclomatic complexity within the lint budget.
 * Order matters: diagnostics and logs are read BEFORE any container removal.
 */
async function onWorkerExit(opts: {
  container: ReturnType<typeof docker.getContainer>;
  result: { StatusCode: number };
  jobId: string;
  jobType: string;
  snapshotEnabled: boolean;
  projectId: string | null;
  workItemId: string | undefined;
}): Promise<void> {
  const { container, result, jobId, jobType, snapshotEnabled, projectId, workItemId } = opts;
  const exitCode = result.StatusCode;

  // Inspect first — OOMKilled/State.Error vanish once the container is reaped.
  const diagnostics = await inspectExitedContainer(container, jobId);
  await logWorkerTail(container);

  if (exitCode !== 0) {
    captureException(new Error(`Worker exited with status ${exitCode}`), {
      tags: { source: 'worker_exit', jobType },
      extra: {
        jobId,
        statusCode: exitCode,
        oomKilled: diagnostics.oomKilled,
        exitReason: diagnostics.exitReason,
        durationMs: diagnostics.durationMs,
      },
    });
  }
  logger.info('[WorkerManager] Worker exited:', {
    jobId,
    statusCode: exitCode,
    oomKilled: diagnostics.oomKilled ?? null,
    exitReason: diagnostics.exitReason ?? null,
    durationMs: diagnostics.durationMs ?? null,
  });

  if (snapshotEnabled) {
    // Commit only on a clean exit with full identity; failed or timed-out
    // runs must NOT create a snapshot.
    if (exitCode === 0 && projectId && workItemId) {
      await commitContainerToSnapshot(container.id, projectId, workItemId);
    } else if (exitCode !== 0) {
      logger.info('[WorkerManager] Skipping snapshot commit after non-zero exit:', {
        jobId,
        statusCode: exitCode,
      });
    }
    // Always remove manually since AutoRemove is disabled for snapshot runs.
    await removeContainer(container.id);
  }

  cleanupWorker(jobId, exitCode, {
    oomKilled: diagnostics.oomKilled,
    exitReason: diagnostics.exitReason,
  });
}

interface SpawnSettings {
snapshotEnabled: boolean;
workerImage: string;
Expand All @@ -148,8 +265,10 @@ interface SpawnSettings {
/**
* Resolve per-project spawn settings (snapshot flag, image, timeout).
* Centralises all loadProjectConfig() calls so spawnWorker stays simple.
*
* @internal Exported for unit testing only — call `spawnWorker` from app code.
*/
async function resolveSpawnSettings(
export async function resolveSpawnSettings(
projectId: string | null,
workItemId: string | undefined,
jobId: string,
Expand Down Expand Up @@ -196,6 +315,21 @@ async function resolveSpawnSettings(
containerTimeoutMs = projectCfg.watchdogTimeoutMs + ROUTER_KILL_BUFFER_MS;
}

// Trace-log the actual values that will govern this worker's lifetime so a
// post-mortem can confirm whether the project's watchdogTimeoutMs override
// took effect or the global default leaked through.
logger.info('[WorkerManager] Resolved spawn settings:', {
jobId,
projectId,
workItemId,
workerImage,
snapshotEnabled,
containerTimeoutMs,
containerTimeoutMinutes: Math.round(containerTimeoutMs / 60_000),
projectWatchdogTimeoutMs: projectCfg?.watchdogTimeoutMs ?? null,
globalWorkerTimeoutMs: routerConfig.workerTimeoutMs,
});

return { snapshotEnabled, workerImage, containerTimeoutMs, snapshotTtlMs };
}

Expand Down Expand Up @@ -299,54 +433,15 @@ async function createAndMonitorContainer(
container
.wait()
.then(async (result) => {
// Collect worker logs before removal
try {
const logs = await container.logs({
stdout: true,
stderr: true,
follow: false,
});
const logText = logs.toString('utf-8');
if (logText.trim()) {
const lines = logText.trim().split('\n');
const tail = lines.slice(-50).join('\n');
logger.info(
`[WorkerManager] Worker logs (last ${Math.min(lines.length, 50)} of ${lines.length} lines):\n${tail}`,
);
}
} catch {
// Container may already be removed — expected with AutoRemove
}

if (result.StatusCode !== 0) {
captureException(new Error(`Worker exited with status ${result.StatusCode}`), {
tags: { source: 'worker_exit', jobType: job.data.type },
extra: { jobId, statusCode: result.StatusCode },
});
}
logger.info('[WorkerManager] Worker exited:', {
await onWorkerExit({
container,
result,
jobId,
statusCode: result.StatusCode,
jobType: job.data.type,
snapshotEnabled,
projectId,
workItemId,
});

// For snapshot-enabled runs, commit on clean exit and then remove the container.
// Failed or timed-out runs must NOT create a snapshot.
// Always remove — even when projectId/workItemId are absent — to avoid
// orphaning containers that ran with AutoRemove=false.
if (snapshotEnabled) {
if (result.StatusCode === 0 && projectId && workItemId) {
await commitContainerToSnapshot(container.id, projectId, workItemId);
} else if (result.StatusCode !== 0) {
logger.info('[WorkerManager] Skipping snapshot commit after non-zero exit:', {
jobId,
statusCode: result.StatusCode,
});
}
// Always remove the container manually since AutoRemove is disabled
await removeContainer(container.id);
}

cleanupWorker(jobId, result.StatusCode);
})
.catch((err) => {
logger.error('[WorkerManager] Error waiting for container:', err);
Expand Down
Loading
Loading