From 87d5767234a820404b742589b62e3e29045d64d2 Mon Sep 17 00:00:00 2001 From: Zbigniew Sobiecki Date: Thu, 16 Apr 2026 18:16:35 +0000 Subject: [PATCH] fix(snapshots): actually rmi evicted images + reconcile orphans on startup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The router's [SnapshotCleanup] loop has been a sham since it shipped: evictSnapshots() removes entries from an in-memory Map and that's it. No docker rmi anywhere in the codebase (verified — whole-source grep returns zero matches). The Map dies on every router restart, so every snapshot image for a work item that never re-runs is orphaned forever. Symptom that surfaced: dev disk filled to 100% (50MB free of 436GB), with 13 cascade-snapshot-llmist-* images on disk totaling ~40GB. Six of those dated to March 25 — three weeks past the 24-hour TTL the "cleanup" loop had been pretending to enforce. Also: snapshotMaxSizeBytes (10GB) eviction has never fired because no caller passed imageSizeBytes to registerSnapshot — container-manager:91 called it with three args, leaving size at 0 in the Map so the size phase always thought the registry was empty. Three narrow fixes that together make the configured TTL/max-count/ max-size limits actually mean something on disk: 1. evictSnapshots now returns the array of evicted SnapshotMetadata instead of a count. snapshot-cleanup.runSnapshotCleanup() iterates that list and calls docker.getImage(name).remove({ force: false }). `force: false` so an in-use image survives (Docker returns 409 — we swallow it). 404 means already gone — also swallowed. Anything else logs warn + sentry capture. 2. New snapshot-startup-sync module: on router boot, list every cascade-snapshot-* image via docker.listImages, register each as a "discovered" entry in the in-memory Map (synthetic project key `__discovered__` to avoid colliding with real registrations), then immediately runSnapshotCleanup so TTL/max-count/max-size apply right away. Wired from worker-manager.startWorkerProcessor() as a best-effort post-startup step. This is what would have caught the six March images after the post-Linear-migration router restart. 3. commitContainerToSnapshot now inspects the freshly committed image and passes Size to registerSnapshot, so the max-size eviction phase actually has data to work with. Inspect failures don't block the registration — they just leave imageSizeBytes undefined, which falls back to TTL/max-count enforcement. Tests: extended snapshot-manager.test.ts (return-type change + new registerDiscoveredSnapshot dedup tests), rewrote snapshot-cleanup.test.ts to mock dockerode and assert image.remove + 409/404 swallow + sentry on unexpected error, added snapshot-startup-sync.test.ts (NEW), updated snapshot-integration.test.ts to expect the size 4th arg on registerSnapshot. All 7830 unit + 524 integration tests pass. Out of scope: backfilling the 40GB on disk (cleaned manually with docker prune; the next router restart will keep on-disk state under SNAPSHOT_MAX_COUNT/SNAPSHOT_MAX_SIZE_BYTES going forward). The 119GB build cache hoarding is a separate operational concern (BuildKit GC config, not a Cascade-level bug). Co-Authored-By: Claude Opus 4.6 (1M context) --- src/router/container-manager.ts | 18 +- src/router/snapshot-cleanup.ts | 76 ++++++- src/router/snapshot-manager.ts | 73 ++++++- src/router/snapshot-startup-sync.ts | 67 ++++++ src/router/worker-manager.ts | 10 + tests/unit/router/snapshot-cleanup.test.ts | 152 ++++++++++++-- .../unit/router/snapshot-integration.test.ts | 22 ++ tests/unit/router/snapshot-manager.test.ts | 77 ++++++- .../unit/router/snapshot-startup-sync.test.ts | 192 ++++++++++++++++++ 9 files changed, 636 insertions(+), 51 deletions(-) create mode 100644 src/router/snapshot-startup-sync.ts create mode 100644 tests/unit/router/snapshot-startup-sync.test.ts diff --git a/src/router/container-manager.ts b/src/router/container-manager.ts index 029a0be0..cdb7894a 100644 --- a/src/router/container-manager.ts +++ b/src/router/container-manager.ts @@ -79,6 +79,17 @@ function buildSnapshotImageName(projectId: string, workItemId: string): string { * On failure the error is logged and swallowed — snapshot failure must not * break the normal post-run flow. */ +async function inspectImageSizeBestEffort(imageName: string): Promise { + try { + const image = docker.getImage(imageName); + if (!image) return undefined; + const info = (await image.inspect()) as { Size?: number } | undefined; + return info?.Size; + } catch { + return undefined; + } +} + async function commitContainerToSnapshot( containerId: string, projectId: string, @@ -88,12 +99,17 @@ async function commitContainerToSnapshot( try { const container = docker.getContainer(containerId); await container.commit({ repo: imageName.split(':')[0], tag: 'latest' }); - registerSnapshot(projectId, workItemId, imageName); + // Populate the image size on the registered metadata so max-size + // eviction actually fires. Inspecting is best-effort — without size, + // the entry still gets TTL/max-count eviction. + const imageSize = await inspectImageSizeBestEffort(imageName); + registerSnapshot(projectId, workItemId, imageName, imageSize); logger.info('[WorkerManager] Committed container to snapshot image:', { containerId: containerId.slice(0, 12), imageName, projectId, workItemId, + imageSizeBytes: imageSize, }); } catch (err) { logger.warn('[WorkerManager] Failed to commit container to snapshot (non-fatal):', { diff --git a/src/router/snapshot-cleanup.ts b/src/router/snapshot-cleanup.ts index fcfa5842..210ef434 100644 --- a/src/router/snapshot-cleanup.ts +++ b/src/router/snapshot-cleanup.ts @@ -4,19 +4,23 @@ * Runs alongside the existing orphan cleanup loop (orphan-cleanup.ts) and * uses the same start/stop lifecycle pattern. On each tick it calls * evictSnapshots() to enforce the per-project TTL and global max-count / - * max-size budget limits. + * max-size budget limits, then `docker rmi`s every evicted entry's image. * - * This module owns only the timer — no Docker API usage. The actual eviction - * logic lives in snapshot-manager.ts. + * The Docker rmi step is critical: prior to PR #1132 the eviction loop only + * cleared the in-memory metadata Map and never freed the underlying images, + * which leaked ~3 GB per work item until the host disk filled. */ +import Docker from 'dockerode'; import { captureException } from '../sentry.js'; import { logger } from '../utils/logging.js'; import { routerConfig } from './config.js'; -import { evictSnapshots } from './snapshot-manager.js'; +import { evictSnapshots, type SnapshotMetadata } from './snapshot-manager.js'; const SNAPSHOT_CLEANUP_INTERVAL_MS = 5 * 60 * 1000; // 5 minutes +const docker = new Docker(); + /** Periodic snapshot cleanup timer */ let snapshotCleanupTimer: NodeJS.Timeout | null = null; @@ -56,9 +60,61 @@ export function stopSnapshotCleanup(): void { } } +interface DockerErrorShape { + statusCode?: number; +} + +function dockerStatusCode(err: unknown): number | undefined { + if (err && typeof err === 'object' && 'statusCode' in err) { + const code = (err as DockerErrorShape).statusCode; + return typeof code === 'number' ? code : undefined; + } + return undefined; +} + +/** + * Remove a snapshot image from Docker. `force: false` so an image still backing + * a running container is preserved (Docker returns 409). 404 means the image + * has already been removed by some other path. Both are harmless and silent. + */ +async function removeSnapshotImage(metadata: SnapshotMetadata): Promise { + try { + await docker.getImage(metadata.imageName).remove({ force: false }); + logger.info('[SnapshotCleanup] Removed snapshot image:', { + imageName: metadata.imageName, + }); + } catch (err: unknown) { + const status = dockerStatusCode(err); + if (status === 409) { + logger.debug('[SnapshotCleanup] Snapshot image in use, deferring:', { + imageName: metadata.imageName, + }); + return; + } + if (status === 404) { + logger.debug('[SnapshotCleanup] Snapshot image already gone:', { + imageName: metadata.imageName, + }); + return; + } + logger.warn('[SnapshotCleanup] Failed to remove snapshot image:', { + imageName: metadata.imageName, + error: String(err), + }); + captureException(err, { + tags: { source: 'snapshot_image_remove' }, + extra: { imageName: metadata.imageName }, + level: 'warning', + }); + } +} + /** - * Run a single snapshot eviction sweep using the global config limits. - * Exposed for testing and for manual invocation. + * Run a single snapshot eviction sweep using the global config limits, then + * `docker rmi` each evicted image. + * + * Exposed for testing and for manual invocation (e.g. immediately after + * startup-sync registers orphan images). * @internal Exported for testing */ export async function runSnapshotCleanup(): Promise { @@ -68,7 +124,9 @@ export async function runSnapshotCleanup(): Promise { routerConfig.snapshotMaxSizeBytes, ); - if (evicted > 0) { - logger.info('[SnapshotCleanup] Snapshot cleanup scan removed entries:', { evicted }); - } + if (evicted.length === 0) return; + + await Promise.all(evicted.map(removeSnapshotImage)); + + logger.info('[SnapshotCleanup] Cleanup pass complete:', { count: evicted.length }); } diff --git a/src/router/snapshot-manager.ts b/src/router/snapshot-manager.ts index a9db6734..7559d8f0 100644 --- a/src/router/snapshot-manager.ts +++ b/src/router/snapshot-manager.ts @@ -37,13 +37,24 @@ export interface SnapshotMetadata { /** In-memory snapshot registry keyed by `${projectId}:${workItemId}` */ const snapshots = new Map(); +/** Synthetic projectId used for entries discovered on disk at startup. */ +const DISCOVERED_PROJECT_ID = '__discovered__'; + function snapshotKey(projectId: string, workItemId: string): string { return `${projectId}:${workItemId}`; } +function discoveredKey(imageName: string): string { + return snapshotKey(DISCOVERED_PROJECT_ID, imageName); +} + /** * Register or refresh snapshot metadata for a project+workItem pair. * Overwrites any existing entry for the same key. + * + * Also drops any "discovered" entry that points to the same image, so the + * startup-sync orphan tracking doesn't double-count an image that's now + * being actively managed. */ export function registerSnapshot( projectId: string, @@ -60,6 +71,9 @@ export function registerSnapshot( imageSizeBytes, }; snapshots.set(key, metadata); + // Drop any orphan-tracking entry for the same image — the real registration + // supersedes it. + snapshots.delete(discoveredKey(imageName)); logger.info('[SnapshotManager] Snapshot registered:', { projectId, workItemId, @@ -68,6 +82,43 @@ export function registerSnapshot( return metadata; } +/** + * Track a snapshot image discovered on disk at startup so the cleanup loop + * can apply TTL/max-count/max-size limits to it. + * + * The (projectId, workItemId) pair cannot be reliably parsed from the + * sanitised composite image name — sanitisation collapses runs of '-' so + * `cascade-snapshot-llmist-mng-93:latest` is genuinely ambiguous. We sidestep + * that by keying discovered entries on the image name itself under a synthetic + * project (`__discovered__`). Eviction works the same way regardless of key. + * + * No-op when an entry already exists for this image, either as a discovered + * orphan or as a real registration. + */ +export function registerDiscoveredSnapshot( + imageName: string, + createdAt: Date, + imageSizeBytes: number, +): void { + const key = discoveredKey(imageName); + if (snapshots.has(key)) return; + for (const m of snapshots.values()) { + if (m.imageName === imageName) return; + } + snapshots.set(key, { + imageName, + projectId: DISCOVERED_PROJECT_ID, + workItemId: imageName, + createdAt, + imageSizeBytes, + }); + logger.debug('[SnapshotManager] Discovered orphan snapshot tracked:', { + imageName, + createdAt: createdAt.toISOString(), + imageSizeBytes, + }); +} + /** * Look up snapshot metadata for a project+workItem pair. * Returns undefined if no snapshot exists or if the snapshot has exceeded the @@ -135,17 +186,17 @@ export function getSnapshotCount(): number { * 3. Max-size: if still over-budget, remove oldest entries until estimated * total size is at or below snapshotMaxSizeBytes. * - * Returns the number of entries removed. - * - * This function operates only on the in-memory metadata registry. It does NOT - * remove Docker images — callers are responsible for any Docker cleanup. + * Returns the metadata of every evicted entry so the caller can do the + * matching `docker rmi`. Removing the in-memory entry without removing the + * underlying image (the prior behaviour of this function) is the leak that + * filled the dev disk to 100% — callers MUST act on the returned list. */ export function evictSnapshots( ttlMs: number = routerConfig.snapshotDefaultTtlMs, maxCount: number = routerConfig.snapshotMaxCount, maxSizeBytes: number = routerConfig.snapshotMaxSizeBytes, -): number { - let evicted = 0; +): SnapshotMetadata[] { + const evicted: SnapshotMetadata[] = []; const now = Date.now(); // Phase 1: TTL eviction — remove all expired entries @@ -153,7 +204,7 @@ export function evictSnapshots( const ageMs = now - metadata.createdAt.getTime(); if (ageMs > ttlMs) { snapshots.delete(key); - evicted++; + evicted.push(metadata); logger.info('[SnapshotManager] Evicted expired snapshot:', { projectId: metadata.projectId, workItemId: metadata.workItemId, @@ -172,7 +223,7 @@ export function evictSnapshots( for (let i = 0; i < toRemove; i++) { const [key, metadata] = sorted[i]; snapshots.delete(key); - evicted++; + evicted.push(metadata); logger.info('[SnapshotManager] Evicted snapshot (over max-count):', { projectId: metadata.projectId, workItemId: metadata.workItemId, @@ -195,7 +246,7 @@ export function evictSnapshots( for (const [key, metadata] of sorted) { if (runningSize <= maxSizeBytes) break; snapshots.delete(key); - evicted++; + evicted.push(metadata); runningSize -= metadata.imageSizeBytes ?? 0; logger.info('[SnapshotManager] Evicted snapshot (over max-size):', { projectId: metadata.projectId, @@ -207,9 +258,9 @@ export function evictSnapshots( } } - if (evicted > 0) { + if (evicted.length > 0) { logger.info('[SnapshotManager] Eviction sweep complete:', { - evicted, + evicted: evicted.length, remaining: snapshots.size, }); } diff --git a/src/router/snapshot-startup-sync.ts b/src/router/snapshot-startup-sync.ts new file mode 100644 index 00000000..5bafa14d --- /dev/null +++ b/src/router/snapshot-startup-sync.ts @@ -0,0 +1,67 @@ +/** + * Snapshot startup reconciliation. + * + * Called once at router boot. Lists all `cascade-snapshot-*` images currently + * on disk and registers each one as a "discovered" snapshot in the in-memory + * registry, so the regular cleanup loop can apply TTL/max-count/max-size + * policies to them. + * + * Without this, every snapshot for a work item that never re-runs is orphaned + * forever (the in-memory registry is process-local; restarts wipe it). Exactly + * the leak that filled the dev disk to 100% with 40 GB of three-week-old + * llmist Trello snapshots. + * + * Best-effort: a Docker outage at boot must not block router startup. + */ + +import Docker from 'dockerode'; +import { captureException } from '../sentry.js'; +import { logger } from '../utils/logging.js'; +import { runSnapshotCleanup } from './snapshot-cleanup.js'; +import { registerDiscoveredSnapshot } from './snapshot-manager.js'; + +const SNAPSHOT_IMAGE_PREFIX = 'cascade-snapshot-'; + +const docker = new Docker(); + +interface DockerImageSummary { + RepoTags?: string[] | null; + Created: number; + Size: number; +} + +function isCascadeSnapshotTag(tag: string): boolean { + return tag.startsWith(SNAPSHOT_IMAGE_PREFIX); +} + +/** + * Discover existing snapshot images on disk and register them. Always runs the + * cleanup sweep at the end so TTL/max-count/max-size policies apply + * immediately to whatever was just registered (and to anything left over from + * a previous run that the registry already knew about). + */ +export async function syncSnapshotsFromDocker(): Promise { + let registered = 0; + try { + const images = (await docker.listImages()) as DockerImageSummary[]; + for (const img of images) { + const tags = img.RepoTags ?? []; + for (const tag of tags) { + if (!isCascadeSnapshotTag(tag)) continue; + registerDiscoveredSnapshot(tag, new Date(img.Created * 1000), img.Size); + registered++; + } + } + logger.info('[SnapshotStartupSync] Reconciled snapshot images from Docker:', { registered }); + } catch (err) { + logger.warn('[SnapshotStartupSync] Failed to sync snapshots from Docker:', { + error: String(err), + }); + captureException(err, { + tags: { source: 'snapshot_startup_sync' }, + level: 'warning', + }); + } + + await runSnapshotCleanup(); +} diff --git a/src/router/worker-manager.ts b/src/router/worker-manager.ts index 63054f01..11709767 100644 --- a/src/router/worker-manager.ts +++ b/src/router/worker-manager.ts @@ -21,6 +21,7 @@ import { } from './container-manager.js'; import type { CascadeJob } from './queue.js'; import { startSnapshotCleanup, stopSnapshotCleanup } from './snapshot-cleanup.js'; +import { syncSnapshotsFromDocker } from './snapshot-startup-sync.js'; // Re-export container-manager public API so existing callers are unaffected. export { getActiveWorkerCount, getActiveWorkers, startOrphanCleanup, stopOrphanCleanup }; @@ -82,6 +83,15 @@ export function startWorkerProcessor(): void { // Start periodic snapshot eviction alongside orphan cleanup startSnapshotCleanup(); + // Reconcile pre-existing snapshot images on disk so the eviction loop can + // apply TTL/max-count/max-size policies to them. Best-effort — Docker + // outage at boot must not block the worker manager from starting. + void syncSnapshotsFromDocker().catch((err) => { + logger.warn('[WorkerManager] Snapshot startup sync failed (continuing):', { + error: String(err), + }); + }); + logger.info('[WorkerManager] Started with max', routerConfig.maxWorkers, 'concurrent workers'); } diff --git a/tests/unit/router/snapshot-cleanup.test.ts b/tests/unit/router/snapshot-cleanup.test.ts index 337670b9..a19249bf 100644 --- a/tests/unit/router/snapshot-cleanup.test.ts +++ b/tests/unit/router/snapshot-cleanup.test.ts @@ -4,25 +4,42 @@ import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'; // Hoisted mock state // --------------------------------------------------------------------------- -const { mockEvictSnapshots } = vi.hoisted(() => ({ - mockEvictSnapshots: vi.fn().mockReturnValue(0), -})); +const { + mockEvictSnapshots, + mockDockerGetImage, + mockImageRemove, + mockCaptureException, + mockLogger, +} = vi.hoisted(() => { + const mockImageRemove = vi.fn().mockResolvedValue(undefined); + return { + mockEvictSnapshots: vi.fn().mockReturnValue([]), + mockDockerGetImage: vi.fn().mockReturnValue({ remove: mockImageRemove }), + mockImageRemove, + mockCaptureException: vi.fn(), + mockLogger: { + info: vi.fn(), + warn: vi.fn(), + error: vi.fn(), + debug: vi.fn(), + }, + }; +}); // --------------------------------------------------------------------------- // Module-level mocks // --------------------------------------------------------------------------- +vi.mock('dockerode', () => ({ + default: vi.fn().mockImplementation(() => ({ getImage: mockDockerGetImage })), +})); + vi.mock('../../../src/utils/logging.js', () => ({ - logger: { - info: vi.fn(), - warn: vi.fn(), - error: vi.fn(), - debug: vi.fn(), - }, + logger: mockLogger, })); vi.mock('../../../src/sentry.js', () => ({ - captureException: vi.fn(), + captureException: mockCaptureException, })); vi.mock('../../../src/router/config.js', () => ({ @@ -47,22 +64,50 @@ import { stopSnapshotCleanup, } from '../../../src/router/snapshot-cleanup.js'; +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +interface DockerErrorShape { + statusCode?: number; + message?: string; +} + +function makeDockerError(statusCode: number, message: string): Error & DockerErrorShape { + const err = new Error(message) as Error & DockerErrorShape; + err.statusCode = statusCode; + return err; +} + +function makeMetadata(overrides: Partial<{ imageName: string; size: number }> = {}) { + return { + imageName: overrides.imageName ?? 'cascade-snapshot-proj-card:latest', + projectId: 'proj', + workItemId: 'card', + createdAt: new Date(), + imageSizeBytes: overrides.size ?? 100, + }; +} + // --------------------------------------------------------------------------- // Tests // --------------------------------------------------------------------------- describe('snapshot-cleanup', () => { beforeEach(() => { - vi.spyOn(console, 'log').mockImplementation(() => {}); - vi.spyOn(console, 'info').mockImplementation(() => {}); - vi.spyOn(console, 'warn').mockImplementation(() => {}); - vi.spyOn(console, 'error').mockImplementation(() => {}); mockEvictSnapshots.mockClear(); - mockEvictSnapshots.mockReturnValue(0); + mockDockerGetImage.mockClear(); + mockImageRemove.mockReset(); + mockImageRemove.mockResolvedValue(undefined); + mockCaptureException.mockClear(); + mockLogger.info.mockClear(); + mockLogger.warn.mockClear(); + mockLogger.error.mockClear(); + mockLogger.debug.mockClear(); + mockEvictSnapshots.mockReturnValue([]); }); afterEach(() => { - vi.restoreAllMocks(); stopSnapshotCleanup(); }); @@ -102,7 +147,7 @@ describe('snapshot-cleanup', () => { }); // ------------------------------------------------------------------------- - // runSnapshotCleanup + // runSnapshotCleanup — invocation // ------------------------------------------------------------------------- describe('runSnapshotCleanup', () => { @@ -116,14 +161,79 @@ describe('snapshot-cleanup', () => { ); }); - it('resolves without throwing when evictSnapshots returns 0', async () => { - mockEvictSnapshots.mockReturnValue(0); + it('resolves without throwing when evictSnapshots returns no entries', async () => { + mockEvictSnapshots.mockReturnValue([]); + await expect(runSnapshotCleanup()).resolves.toBeUndefined(); + expect(mockDockerGetImage).not.toHaveBeenCalled(); + }); + }); + + // ------------------------------------------------------------------------- + // runSnapshotCleanup — Docker rmi behavior + // ------------------------------------------------------------------------- + + describe('runSnapshotCleanup — actually removes Docker images', () => { + it('calls docker.getImage().remove({ force: false }) for each evicted entry', async () => { + mockEvictSnapshots.mockReturnValue([ + makeMetadata({ imageName: 'cascade-snapshot-a:latest' }), + makeMetadata({ imageName: 'cascade-snapshot-b:latest' }), + ]); + + await runSnapshotCleanup(); + + expect(mockDockerGetImage).toHaveBeenCalledWith('cascade-snapshot-a:latest'); + expect(mockDockerGetImage).toHaveBeenCalledWith('cascade-snapshot-b:latest'); + expect(mockImageRemove).toHaveBeenCalledTimes(2); + expect(mockImageRemove).toHaveBeenCalledWith({ force: false }); + }); + + it('swallows 409 (image in use) without warning or sentry capture', async () => { + mockEvictSnapshots.mockReturnValue([makeMetadata()]); + mockImageRemove.mockRejectedValueOnce(makeDockerError(409, 'image is in use by container')); + + await expect(runSnapshotCleanup()).resolves.toBeUndefined(); + + expect(mockLogger.warn).not.toHaveBeenCalled(); + expect(mockCaptureException).not.toHaveBeenCalled(); + }); + + it('swallows 404 (image already gone) without warning or sentry capture', async () => { + mockEvictSnapshots.mockReturnValue([makeMetadata()]); + mockImageRemove.mockRejectedValueOnce(makeDockerError(404, 'no such image')); + await expect(runSnapshotCleanup()).resolves.toBeUndefined(); + + expect(mockLogger.warn).not.toHaveBeenCalled(); + expect(mockCaptureException).not.toHaveBeenCalled(); }); - it('resolves without throwing when evictSnapshots removes entries', async () => { - mockEvictSnapshots.mockReturnValue(3); + it('logs a warning and captures Sentry exception on unexpected error', async () => { + mockEvictSnapshots.mockReturnValue([makeMetadata()]); + mockImageRemove.mockRejectedValueOnce(new Error('docker daemon down')); + await expect(runSnapshotCleanup()).resolves.toBeUndefined(); + + expect(mockLogger.warn).toHaveBeenCalledWith( + expect.stringContaining('Failed to remove snapshot image'), + expect.objectContaining({ imageName: expect.any(String) }), + ); + expect(mockCaptureException).toHaveBeenCalled(); + }); + + it('continues processing remaining entries when one removal fails', async () => { + mockEvictSnapshots.mockReturnValue([ + makeMetadata({ imageName: 'a:latest' }), + makeMetadata({ imageName: 'b:latest' }), + makeMetadata({ imageName: 'c:latest' }), + ]); + mockImageRemove + .mockResolvedValueOnce(undefined) + .mockRejectedValueOnce(new Error('boom')) + .mockResolvedValueOnce(undefined); + + await runSnapshotCleanup(); + + expect(mockImageRemove).toHaveBeenCalledTimes(3); }); }); }); diff --git a/tests/unit/router/snapshot-integration.test.ts b/tests/unit/router/snapshot-integration.test.ts index d2129a93..66e78084 100644 --- a/tests/unit/router/snapshot-integration.test.ts +++ b/tests/unit/router/snapshot-integration.test.ts @@ -16,6 +16,7 @@ import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'; const { mockDockerCreateContainer, mockDockerGetContainer, + mockDockerGetImage, mockLoadProjectConfig, mockGetSnapshot, mockRegisterSnapshot, @@ -23,6 +24,12 @@ const { } = vi.hoisted(() => ({ mockDockerCreateContainer: vi.fn(), mockDockerGetContainer: vi.fn(), + // commitContainerToSnapshot inspects the freshly committed image to + // populate imageSizeBytes; default to a fixed size so registerSnapshot + // receives a deterministic 4th argument. + mockDockerGetImage: vi.fn().mockReturnValue({ + inspect: vi.fn().mockResolvedValue({ Size: 1_234_567_890 }), + }), mockLoadProjectConfig: vi.fn().mockResolvedValue({ projects: [], fullProjects: [] }), mockGetSnapshot: vi.fn().mockReturnValue(undefined), mockRegisterSnapshot: vi.fn(), @@ -37,6 +44,7 @@ vi.mock('dockerode', () => ({ default: vi.fn().mockImplementation(() => ({ createContainer: mockDockerCreateContainer, getContainer: mockDockerGetContainer, + getImage: mockDockerGetImage, })), })); @@ -158,6 +166,19 @@ function setupMockContainer(exitCode = 0) { }; } +// --------------------------------------------------------------------------- +// File-wide setup — vi.restoreAllMocks() in per-describe afterEach hooks wipes +// mockReturnValue on hoisted mocks. Re-arm the docker getImage mock here so +// commitContainerToSnapshot's image-size lookup always resolves to a known +// value across every describe. +// --------------------------------------------------------------------------- + +beforeEach(() => { + mockDockerGetImage.mockReturnValue({ + inspect: vi.fn().mockResolvedValue({ Size: 1_234_567_890 }), + }); +}); + // --------------------------------------------------------------------------- // Tests // --------------------------------------------------------------------------- @@ -431,6 +452,7 @@ describe('spawnWorker — snapshot hit (existing snapshot)', () => { 'proj-snap', 'card-snap', expect.stringContaining('cascade-snapshot-proj-snap-card-snap'), + 1_234_567_890, // size from mockDockerGetImage's inspect mock ); }); }); diff --git a/tests/unit/router/snapshot-manager.test.ts b/tests/unit/router/snapshot-manager.test.ts index ed334d57..b0825ea7 100644 --- a/tests/unit/router/snapshot-manager.test.ts +++ b/tests/unit/router/snapshot-manager.test.ts @@ -237,8 +237,8 @@ describe('snapshot-manager', () => { // ------------------------------------------------------------------------- describe('evictSnapshots', () => { - it('returns 0 when no snapshots are registered', () => { - expect(evictSnapshots(1000, 5, 10 * 1024 * 1024 * 1024)).toBe(0); + it('returns an empty array when no snapshots are registered', () => { + expect(evictSnapshots(1000, 5, 10 * 1024 * 1024 * 1024)).toEqual([]); }); it('evicts expired snapshots by TTL', () => { @@ -251,7 +251,8 @@ describe('snapshot-manager', () => { const evicted = evictSnapshots(1000, 10, 10 * 1024 * 1024 * 1024); - expect(evicted).toBe(1); + expect(evicted).toHaveLength(1); + expect(evicted[0].imageName).toBe('img-1:latest'); expect(getSnapshotCount()).toBe(1); expect(getSnapshot('proj-1', 'card-2')).toBeDefined(); expect(getSnapshot('proj-1', 'card-1', 1000)).toBeUndefined(); @@ -270,7 +271,7 @@ describe('snapshot-manager', () => { // Allow all TTL, but cap at 2 snapshots const evicted = evictSnapshots(24 * 60 * 60 * 1000, 2, 10 * 1024 * 1024 * 1024); - expect(evicted).toBe(1); + expect(evicted).toHaveLength(1); expect(getSnapshotCount()).toBe(2); // s1 (oldest) should have been evicted expect(getSnapshot('proj-1', 'card-1')).toBeUndefined(); @@ -293,7 +294,7 @@ describe('snapshot-manager', () => { const evicted = evictSnapshots(24 * 60 * 60 * 1000, 100, 1100); // After removing s1 (500 bytes): 1000 <= 1100, done - expect(evicted).toBe(1); + expect(evicted).toHaveLength(1); expect(getSnapshotCount()).toBe(2); expect(getSnapshot('proj-1', 'card-1')).toBeUndefined(); expect(getSnapshot('proj-1', 'card-2')).toBeDefined(); @@ -314,7 +315,7 @@ describe('snapshot-manager', () => { const evicted = evictSnapshots(1000, 2, 10 * 1024 * 1024 * 1024); // Both expired, so 2 removed by TTL, count drops to 1 which is under maxCount=2 - expect(evicted).toBe(2); + expect(evicted).toHaveLength(2); expect(getSnapshotCount()).toBe(1); expect(getSnapshot('proj-1', 'card-3')).toBeDefined(); }); @@ -325,7 +326,7 @@ describe('snapshot-manager', () => { const evicted = evictSnapshots(24 * 60 * 60 * 1000, 10, 10 * 1024 * 1024 * 1024); - expect(evicted).toBe(0); + expect(evicted).toEqual([]); expect(getSnapshotCount()).toBe(2); }); @@ -341,7 +342,7 @@ describe('snapshot-manager', () => { // Snapshots without size contribute 0 — no eviction needed const evicted = evictSnapshots(24 * 60 * 60 * 1000, 100, 1000); - expect(evicted).toBe(0); + expect(evicted).toEqual([]); expect(getSnapshotCount()).toBe(2); }); @@ -357,8 +358,66 @@ describe('snapshot-manager', () => { // should use that default const evicted = evictSnapshots(); - expect(evicted).toBe(1); + expect(evicted).toHaveLength(1); expect(getSnapshotCount()).toBe(5); }); + + it('returns SnapshotMetadata for each evicted entry so the caller can rmi', () => { + const s1 = registerSnapshot('proj-1', 'card-old', 'img-old:latest', 1000); + s1.createdAt = new Date(Date.now() - 99_999_999); + + const evicted = evictSnapshots(1000, 100, 10 * 1024 * 1024 * 1024); + expect(evicted).toHaveLength(1); + expect(evicted[0]).toMatchObject({ + imageName: 'img-old:latest', + projectId: 'proj-1', + workItemId: 'card-old', + imageSizeBytes: 1000, + }); + }); + }); + + // ------------------------------------------------------------------------- + // registerDiscoveredSnapshot — startup-sync entry point + // ------------------------------------------------------------------------- + + describe('registerDiscoveredSnapshot', () => { + it('tracks an image found on disk so the cleanup loop can evict it', async () => { + const { registerDiscoveredSnapshot } = await import( + '../../../src/router/snapshot-manager.js' + ); + const old = new Date(Date.now() - 99_999_999); + registerDiscoveredSnapshot('cascade-snapshot-orphan:latest', old, 3_000_000_000); + + expect(getSnapshotCount()).toBe(1); + const evicted = evictSnapshots(1000, 100, 10 * 1024 * 1024 * 1024); + expect(evicted).toHaveLength(1); + expect(evicted[0].imageName).toBe('cascade-snapshot-orphan:latest'); + expect(evicted[0].imageSizeBytes).toBe(3_000_000_000); + }); + + it('does not duplicate when the same image is registered twice', async () => { + const { registerDiscoveredSnapshot } = await import( + '../../../src/router/snapshot-manager.js' + ); + const ts = new Date(); + registerDiscoveredSnapshot('cascade-snapshot-dup:latest', ts, 100); + registerDiscoveredSnapshot('cascade-snapshot-dup:latest', ts, 100); + expect(getSnapshotCount()).toBe(1); + }); + + it('is dropped when a real registerSnapshot lands the same image name', async () => { + const { registerDiscoveredSnapshot } = await import( + '../../../src/router/snapshot-manager.js' + ); + const old = new Date(Date.now() - 99_999_999); + registerDiscoveredSnapshot('cascade-snapshot-llmist-mng-95:latest', old, 100); + expect(getSnapshotCount()).toBe(1); + + registerSnapshot('llmist', 'mng-95', 'cascade-snapshot-llmist-mng-95:latest', 200); + // Discovered entry replaced; only the real entry remains. + expect(getSnapshotCount()).toBe(1); + expect(getSnapshot('llmist', 'mng-95')?.imageSizeBytes).toBe(200); + }); }); }); diff --git a/tests/unit/router/snapshot-startup-sync.test.ts b/tests/unit/router/snapshot-startup-sync.test.ts new file mode 100644 index 00000000..90dad2e1 --- /dev/null +++ b/tests/unit/router/snapshot-startup-sync.test.ts @@ -0,0 +1,192 @@ +/** + * Verifies that the router can recover from restart amnesia: on startup, + * any `cascade-snapshot-*` images already on disk get registered in the + * in-memory map (with their actual creation time + size) so the next + * eviction sweep can apply TTL/max-count/max-size limits to them. + * + * Without this, snapshot images for work items that never re-run pile up + * forever — exactly the leak that filled the dev disk to 100% (40 GB of + * orphan llmist snapshots dating back 3 weeks). + */ + +import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'; + +// --------------------------------------------------------------------------- +// Hoisted mock state +// --------------------------------------------------------------------------- + +const { + mockListImages, + mockDockerGetImage, + mockRegisterDiscoveredSnapshot, + mockRunSnapshotCleanup, + mockLogger, +} = vi.hoisted(() => { + const mockImageRemove = vi.fn().mockResolvedValue(undefined); + return { + mockListImages: vi.fn().mockResolvedValue([]), + mockDockerGetImage: vi.fn().mockReturnValue({ remove: mockImageRemove }), + mockRegisterDiscoveredSnapshot: vi.fn(), + mockRunSnapshotCleanup: vi.fn().mockResolvedValue(undefined), + mockLogger: { + info: vi.fn(), + warn: vi.fn(), + error: vi.fn(), + debug: vi.fn(), + }, + }; +}); + +// --------------------------------------------------------------------------- +// Module-level mocks +// --------------------------------------------------------------------------- + +vi.mock('dockerode', () => ({ + default: vi.fn().mockImplementation(() => ({ + listImages: mockListImages, + getImage: mockDockerGetImage, + })), +})); + +vi.mock('../../../src/utils/logging.js', () => ({ + logger: mockLogger, +})); + +vi.mock('../../../src/sentry.js', () => ({ + captureException: vi.fn(), +})); + +vi.mock('../../../src/router/snapshot-manager.js', () => ({ + registerDiscoveredSnapshot: (...args: unknown[]) => mockRegisterDiscoveredSnapshot(...args), +})); + +vi.mock('../../../src/router/snapshot-cleanup.js', () => ({ + runSnapshotCleanup: () => mockRunSnapshotCleanup(), +})); + +// --------------------------------------------------------------------------- +// Imports (after mocks) +// --------------------------------------------------------------------------- + +import { syncSnapshotsFromDocker } from '../../../src/router/snapshot-startup-sync.js'; + +// --------------------------------------------------------------------------- +// Tests +// --------------------------------------------------------------------------- + +describe('snapshot-startup-sync', () => { + beforeEach(() => { + mockListImages.mockReset(); + mockListImages.mockResolvedValue([]); + mockRegisterDiscoveredSnapshot.mockClear(); + mockRunSnapshotCleanup.mockClear(); + mockLogger.info.mockClear(); + mockLogger.warn.mockClear(); + mockLogger.error.mockClear(); + }); + + afterEach(() => { + vi.clearAllMocks(); + }); + + it('registers every cascade-snapshot-* image found on disk with its actual creation time + size', async () => { + const t1 = 1700000000; + const t2 = 1700001000; + mockListImages.mockResolvedValueOnce([ + { + RepoTags: ['cascade-snapshot-llmist-mng-93:latest'], + Created: t1, + Size: 3_000_000_000, + }, + { + RepoTags: ['cascade-snapshot-llmist-mng-94:latest'], + Created: t2, + Size: 2_500_000_000, + }, + // Unrelated images are ignored. + { RepoTags: ['cascade-router:dev'], Created: t1, Size: 200_000_000 }, + { RepoTags: ['postgres:16-alpine'], Created: t1, Size: 100_000_000 }, + ]); + + await syncSnapshotsFromDocker(); + + expect(mockRegisterDiscoveredSnapshot).toHaveBeenCalledTimes(2); + expect(mockRegisterDiscoveredSnapshot).toHaveBeenCalledWith( + 'cascade-snapshot-llmist-mng-93:latest', + new Date(t1 * 1000), + 3_000_000_000, + ); + expect(mockRegisterDiscoveredSnapshot).toHaveBeenCalledWith( + 'cascade-snapshot-llmist-mng-94:latest', + new Date(t2 * 1000), + 2_500_000_000, + ); + }); + + it('runs the cleanup sweep immediately after registration so TTL applies to discovered images', async () => { + mockListImages.mockResolvedValueOnce([ + { + RepoTags: ['cascade-snapshot-old:latest'], + Created: 1700000000, + Size: 1_000_000_000, + }, + ]); + + await syncSnapshotsFromDocker(); + + expect(mockRunSnapshotCleanup).toHaveBeenCalledTimes(1); + // And the order matters: register BEFORE cleanup + const registerCallOrder = mockRegisterDiscoveredSnapshot.mock.invocationCallOrder[0]; + const cleanupCallOrder = mockRunSnapshotCleanup.mock.invocationCallOrder[0]; + expect(registerCallOrder).toBeLessThan(cleanupCallOrder); + }); + + it('handles images with multiple repo tags (registers each cascade-snapshot tag)', async () => { + mockListImages.mockResolvedValueOnce([ + { + RepoTags: ['cascade-snapshot-a:latest', 'cascade-snapshot-a:v1', 'random:tag'], + Created: 1700000000, + Size: 500_000_000, + }, + ]); + + await syncSnapshotsFromDocker(); + + // Both cascade-snapshot tags registered; random:tag ignored. + expect(mockRegisterDiscoveredSnapshot).toHaveBeenCalledTimes(2); + }); + + it('does not throw if Docker listImages itself fails (best-effort startup)', async () => { + mockListImages.mockRejectedValueOnce(new Error('docker daemon unreachable')); + + await expect(syncSnapshotsFromDocker()).resolves.toBeUndefined(); + expect(mockLogger.warn).toHaveBeenCalledWith( + expect.stringContaining('Failed to sync snapshots from Docker'), + expect.objectContaining({ error: expect.any(String) }), + ); + // Cleanup loop still runs against whatever's already in the registry. + expect(mockRunSnapshotCleanup).toHaveBeenCalledTimes(1); + }); + + it('is a no-op when no cascade-snapshot images exist', async () => { + mockListImages.mockResolvedValueOnce([ + { RepoTags: ['cascade-router:dev'], Created: 1700000000, Size: 200_000_000 }, + ]); + + await syncSnapshotsFromDocker(); + + expect(mockRegisterDiscoveredSnapshot).not.toHaveBeenCalled(); + // Cleanup still runs (idempotent, harmless). + expect(mockRunSnapshotCleanup).toHaveBeenCalledTimes(1); + }); + + it('handles images with null RepoTags gracefully', async () => { + mockListImages.mockResolvedValueOnce([ + { RepoTags: null, Created: 1700000000, Size: 100 }, + { RepoTags: undefined, Created: 1700000001, Size: 100 }, + ]); + + await expect(syncSnapshotsFromDocker()).resolves.toBeUndefined(); + expect(mockRegisterDiscoveredSnapshot).not.toHaveBeenCalled(); + }); +});