From 3c0c3996730b9f9828429fc5893e829993ea3124 Mon Sep 17 00:00:00 2001 From: Zbigniew Sobiecki Date: Wed, 29 Apr 2026 16:49:07 +0000 Subject: [PATCH] fix(router): rmi snapshot image on PR merge so it isn't leaked to disk MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `pr-merged` previously called `invalidateSnapshot(projectId, workItemId)` (registry-only). The image lingered in Docker storage, and the periodic 5-minute cleanup loop iterates registry entries only — so the now-orphaned image was leaked permanently, freed only by `docker image prune -af` or host disk pressure. This is a primary contributor to the 2026-04-29 disk- fill incident on prod. Fix: split into two pieces. - `invalidateSnapshot(projectId, workItemId)` now RETURNS the removed metadata (or undefined) so any caller can `docker rmi` the underlying image. Removing the registry entry without the image is the leak; the return shape forces callers to think about the image. - New helper `invalidateAndRemoveSnapshot(projectId, workItemId)` in `snapshot-cleanup.ts` chains `invalidateSnapshot` + the existing `removeSnapshotImage` (which gracefully handles 404 / 409 + Sentry- captures unexpected docker errors). `pr-merged.ts` calls this helper as fire-and-forget (`void ... .catch(logger.warn)`) — preserves the non-blocking property of the original call site. `removeSnapshotImage` is now exported (was module-private) so the new helper can reuse it without duplicating the docker rmi block. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/router/snapshot-cleanup.ts | 24 ++++++++- src/router/snapshot-manager.ts | 18 +++++-- src/triggers/github/pr-merged.ts | 17 +++++-- tests/unit/router/snapshot-cleanup.test.ts | 59 ++++++++++++++++++++++ tests/unit/router/snapshot-manager.test.ts | 21 ++++++++ tests/unit/triggers/pr-merged.test.ts | 21 +++++--- 6 files changed, 141 insertions(+), 19 deletions(-) diff --git a/src/router/snapshot-cleanup.ts b/src/router/snapshot-cleanup.ts index 210ef434..8d8d5179 100644 --- a/src/router/snapshot-cleanup.ts +++ b/src/router/snapshot-cleanup.ts @@ -15,7 +15,7 @@ import Docker from 'dockerode'; import { captureException } from '../sentry.js'; import { logger } from '../utils/logging.js'; import { routerConfig } from './config.js'; -import { evictSnapshots, type SnapshotMetadata } from './snapshot-manager.js'; +import { evictSnapshots, invalidateSnapshot, type SnapshotMetadata } from './snapshot-manager.js'; const SNAPSHOT_CLEANUP_INTERVAL_MS = 5 * 60 * 1000; // 5 minutes @@ -77,7 +77,7 @@ function dockerStatusCode(err: unknown): number | undefined { * a running container is preserved (Docker returns 409). 404 means the image * has already been removed by some other path. Both are harmless and silent. */ -async function removeSnapshotImage(metadata: SnapshotMetadata): Promise { +export async function removeSnapshotImage(metadata: SnapshotMetadata): Promise { try { await docker.getImage(metadata.imageName).remove({ force: false }); logger.info('[SnapshotCleanup] Removed snapshot image:', { @@ -130,3 +130,23 @@ export async function runSnapshotCleanup(): Promise { logger.info('[SnapshotCleanup] Cleanup pass complete:', { count: evicted.length }); } + +/** + * Eagerly invalidate a snapshot AND remove its Docker image. Called on PR + * merge: the snapshot was built for a specific work-item state and is no + * longer valid. Without this, `invalidateSnapshot` alone clears the registry + * entry but leaves the image in Docker storage — and the periodic 5-min + * cleanup loop iterates registry entries only, so the orphaned image is + * leaked permanently. That leak is what filled prod disk to 100% on + * 2026-04-29. + * + * Safe to call when no snapshot exists for the pair (no-op). + */ +export async function invalidateAndRemoveSnapshot( + projectId: string, + workItemId: string, +): Promise { + const removed = invalidateSnapshot(projectId, workItemId); + if (!removed) return; + await removeSnapshotImage(removed); +} diff --git a/src/router/snapshot-manager.ts b/src/router/snapshot-manager.ts index 7559d8f0..6669229f 100644 --- a/src/router/snapshot-manager.ts +++ b/src/router/snapshot-manager.ts @@ -155,17 +155,27 @@ export function getSnapshot( /** * Invalidate (remove) snapshot metadata for a project+workItem pair. - * Safe to call even if no snapshot exists. + * Returns the removed metadata so the caller can `docker rmi` the underlying + * image. Returns undefined when no entry was registered. Removing the in-memory + * entry without removing the image would orphan the image (the periodic cleanup + * loop iterates registry entries only) — callers MUST act on the returned + * metadata. See snapshot-cleanup.invalidateAndRemoveSnapshot for the canonical + * caller. */ -export function invalidateSnapshot(projectId: string, workItemId: string): void { +export function invalidateSnapshot( + projectId: string, + workItemId: string, +): SnapshotMetadata | undefined { const key = snapshotKey(projectId, workItemId); - const hadEntry = snapshots.delete(key); - if (hadEntry) { + const removed = snapshots.get(key); + snapshots.delete(key); + if (removed) { logger.info('[SnapshotManager] Snapshot invalidated:', { projectId, workItemId, }); } + return removed; } /** diff --git a/src/triggers/github/pr-merged.ts b/src/triggers/github/pr-merged.ts index 0d145051..9d86b0d7 100644 --- a/src/triggers/github/pr-merged.ts +++ b/src/triggers/github/pr-merged.ts @@ -1,7 +1,7 @@ import { githubClient } from '../../github/client.js'; import { getPMProvider } from '../../pm/context.js'; import { resolveProjectPMConfig } from '../../pm/lifecycle.js'; -import { invalidateSnapshot } from '../../router/snapshot-manager.js'; +import { invalidateAndRemoveSnapshot } from '../../router/snapshot-cleanup.js'; import type { TriggerContext, TriggerHandler, TriggerResult } from '../../types/index.js'; import { logger } from '../../utils/logging.js'; import { parseRepoFullName } from '../../utils/repo.js'; @@ -47,10 +47,17 @@ export class PRMergedTrigger implements TriggerHandler { return null; } - // Fire-and-forget: invalidate any stale snapshot for this work item now that - // the PR is merged. The snapshot was built for a specific state of the work - // item and is no longer valid after the work is done. - invalidateSnapshot(ctx.project.id, workItemId); + // Fire-and-forget: invalidate the registry entry AND `docker rmi` the + // underlying image. Registry-only invalidation orphans the image (the + // periodic cleanup loop iterates registry entries) — see 2026-04-29 + // disk-fill incident. + void invalidateAndRemoveSnapshot(ctx.project.id, workItemId).catch((err: unknown) => { + logger.warn('Failed to invalidate+remove snapshot on PR merge', { + projectId: ctx.project.id, + workItemId, + error: String(err), + }); + }); const pmConfig = resolveProjectPMConfig(ctx.project); const mergedStatus = pmConfig.statuses.merged; diff --git a/tests/unit/router/snapshot-cleanup.test.ts b/tests/unit/router/snapshot-cleanup.test.ts index a19249bf..a2847ce2 100644 --- a/tests/unit/router/snapshot-cleanup.test.ts +++ b/tests/unit/router/snapshot-cleanup.test.ts @@ -50,8 +50,11 @@ vi.mock('../../../src/router/config.js', () => ({ }, })); +const mockInvalidateSnapshot = vi.fn(); + vi.mock('../../../src/router/snapshot-manager.js', () => ({ evictSnapshots: (...args: unknown[]) => mockEvictSnapshots(...args), + invalidateSnapshot: (...args: unknown[]) => mockInvalidateSnapshot(...args), })); // --------------------------------------------------------------------------- @@ -59,6 +62,7 @@ vi.mock('../../../src/router/snapshot-manager.js', () => ({ // --------------------------------------------------------------------------- import { + invalidateAndRemoveSnapshot, runSnapshotCleanup, startSnapshotCleanup, stopSnapshotCleanup, @@ -236,4 +240,59 @@ describe('snapshot-cleanup', () => { expect(mockImageRemove).toHaveBeenCalledTimes(3); }); }); + + // ------------------------------------------------------------------------- + // invalidateAndRemoveSnapshot — eager rmi when a PR merges (no 5-min wait) + // ------------------------------------------------------------------------- + + describe('invalidateAndRemoveSnapshot', () => { + beforeEach(() => { + mockInvalidateSnapshot.mockReset(); + }); + + it('calls docker.getImage().remove() when invalidate returned metadata', async () => { + // Live regression: prior to this fix, pr-merged.ts called + // `invalidateSnapshot()` (registry-only) and never called + // `docker rmi`. The image was orphaned in the Docker daemon — + // the periodic 5-min cleanup loop only iterates registry entries + // and would never see it. The unregistered image leaked + // permanently, contributing to today's disk-fill incident. + const metadata = makeMetadata({ imageName: 'cascade-snapshot-proj-1-card-abc:latest' }); + mockInvalidateSnapshot.mockReturnValue(metadata); + + await invalidateAndRemoveSnapshot('proj-1', 'card-abc'); + + expect(mockInvalidateSnapshot).toHaveBeenCalledWith('proj-1', 'card-abc'); + expect(mockDockerGetImage).toHaveBeenCalledWith('cascade-snapshot-proj-1-card-abc:latest'); + expect(mockImageRemove).toHaveBeenCalledOnce(); + }); + + it('is a no-op when no snapshot was registered (no docker call)', async () => { + mockInvalidateSnapshot.mockReturnValue(undefined); + + await invalidateAndRemoveSnapshot('proj-missing', 'card-missing'); + + expect(mockInvalidateSnapshot).toHaveBeenCalledWith('proj-missing', 'card-missing'); + expect(mockDockerGetImage).not.toHaveBeenCalled(); + expect(mockImageRemove).not.toHaveBeenCalled(); + }); + + it('swallows 404 (image already gone) without warning or sentry capture', async () => { + mockInvalidateSnapshot.mockReturnValue(makeMetadata()); + mockImageRemove.mockRejectedValueOnce(makeDockerError(404, 'no such image')); + + await expect(invalidateAndRemoveSnapshot('proj-1', 'card-1')).resolves.toBeUndefined(); + expect(mockLogger.warn).not.toHaveBeenCalled(); + expect(mockCaptureException).not.toHaveBeenCalled(); + }); + + it('swallows 409 (image in use) without warning or sentry capture', async () => { + mockInvalidateSnapshot.mockReturnValue(makeMetadata()); + mockImageRemove.mockRejectedValueOnce(makeDockerError(409, 'image in use')); + + await expect(invalidateAndRemoveSnapshot('proj-1', 'card-1')).resolves.toBeUndefined(); + expect(mockLogger.warn).not.toHaveBeenCalled(); + expect(mockCaptureException).not.toHaveBeenCalled(); + }); + }); }); diff --git a/tests/unit/router/snapshot-manager.test.ts b/tests/unit/router/snapshot-manager.test.ts index b0825ea7..bb548b01 100644 --- a/tests/unit/router/snapshot-manager.test.ts +++ b/tests/unit/router/snapshot-manager.test.ts @@ -203,6 +203,27 @@ describe('snapshot-manager', () => { expect(getSnapshot('proj-1', 'card-2')).toBeDefined(); expect(getSnapshotCount()).toBe(1); }); + + // Spec: invalidate-on-PR-merge should also `docker rmi` the image. The + // caller (pr-merged trigger) owns the rmi via the snapshot-cleanup + // helper; for that helper to know which image to remove, invalidate + // must return the metadata of the entry it removed (or undefined + // when nothing was registered). + it('returns the removed metadata when an entry was registered', () => { + registerSnapshot('proj-1', 'card-abc', 'cascade-snapshot-proj-1-card-abc:latest'); + + const removed = invalidateSnapshot('proj-1', 'card-abc'); + + expect(removed).toBeDefined(); + expect(removed?.imageName).toBe('cascade-snapshot-proj-1-card-abc:latest'); + expect(removed?.projectId).toBe('proj-1'); + expect(removed?.workItemId).toBe('card-abc'); + }); + + it('returns undefined when no entry was registered', () => { + const removed = invalidateSnapshot('proj-missing', 'card-missing'); + expect(removed).toBeUndefined(); + }); }); // ------------------------------------------------------------------------- diff --git a/tests/unit/triggers/pr-merged.test.ts b/tests/unit/triggers/pr-merged.test.ts index b11d3904..2bf61481 100644 --- a/tests/unit/triggers/pr-merged.test.ts +++ b/tests/unit/triggers/pr-merged.test.ts @@ -52,10 +52,14 @@ vi.mock('../../../src/db/repositories/prWorkItemsRepository.js', () => ({ lookupWorkItemForPR: vi.fn(), })); -// Mock the snapshot manager so we can verify invalidation calls -const mockInvalidateSnapshot = vi.fn(); -vi.mock('../../../src/router/snapshot-manager.js', () => ({ - invalidateSnapshot: (...args: unknown[]) => mockInvalidateSnapshot(...args), +// Mock the snapshot-cleanup helper so we can verify invalidation+rmi calls. +// pr-merged calls `invalidateAndRemoveSnapshot` (the helper that BOTH clears +// the in-memory registry AND `docker rmi`s the image), not the registry-only +// `invalidateSnapshot`. The latter alone leaks the docker image — see the +// 2026-04-29 disk-fill incident. +const mockInvalidateAndRemoveSnapshot = vi.fn().mockResolvedValue(undefined); +vi.mock('../../../src/router/snapshot-cleanup.js', () => ({ + invalidateAndRemoveSnapshot: (...args: unknown[]) => mockInvalidateAndRemoveSnapshot(...args), })); // Register PM integrations in the registry via the canonical bootstrap path @@ -91,7 +95,8 @@ describe('PRMergedTrigger', () => { beforeEach(() => { vi.mocked(lookupWorkItemForPR).mockResolvedValue('abc123'); vi.mocked(checkTriggerEnabled).mockResolvedValue(true); - mockInvalidateSnapshot.mockClear(); + mockInvalidateAndRemoveSnapshot.mockReset(); + mockInvalidateAndRemoveSnapshot.mockResolvedValue(undefined); }); describe('matches', () => { @@ -703,7 +708,7 @@ describe('PRMergedTrigger', () => { await trigger.handle(ctx); // Snapshot should be invalidated for the project+workItem pair - expect(mockInvalidateSnapshot).toHaveBeenCalledWith(mockProject.id, 'abc123'); + expect(mockInvalidateAndRemoveSnapshot).toHaveBeenCalledWith(mockProject.id, 'abc123'); }); it('does not invalidate snapshot when PR is not merged', async () => { @@ -737,7 +742,7 @@ describe('PRMergedTrigger', () => { await trigger.handle(ctx); // No invalidation when PR is not merged - expect(mockInvalidateSnapshot).not.toHaveBeenCalled(); + expect(mockInvalidateAndRemoveSnapshot).not.toHaveBeenCalled(); }); it('does not invalidate snapshot when no work item is linked', async () => { @@ -772,7 +777,7 @@ describe('PRMergedTrigger', () => { await trigger.handle(ctx); // No invalidation when there's no linked work item - expect(mockInvalidateSnapshot).not.toHaveBeenCalled(); + expect(mockInvalidateAndRemoveSnapshot).not.toHaveBeenCalled(); }); }); });