Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 22 additions & 2 deletions src/router/snapshot-cleanup.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ import Docker from 'dockerode';
import { captureException } from '../sentry.js';
import { logger } from '../utils/logging.js';
import { routerConfig } from './config.js';
import { evictSnapshots, type SnapshotMetadata } from './snapshot-manager.js';
import { evictSnapshots, invalidateSnapshot, type SnapshotMetadata } from './snapshot-manager.js';

const SNAPSHOT_CLEANUP_INTERVAL_MS = 5 * 60 * 1000; // 5 minutes

Expand Down Expand Up @@ -77,7 +77,7 @@ function dockerStatusCode(err: unknown): number | undefined {
* a running container is preserved (Docker returns 409). 404 means the image
* has already been removed by some other path. Both are harmless and silent.
*/
async function removeSnapshotImage(metadata: SnapshotMetadata): Promise<void> {
export async function removeSnapshotImage(metadata: SnapshotMetadata): Promise<void> {
try {
await docker.getImage(metadata.imageName).remove({ force: false });
logger.info('[SnapshotCleanup] Removed snapshot image:', {
Expand Down Expand Up @@ -130,3 +130,23 @@ export async function runSnapshotCleanup(): Promise<void> {

logger.info('[SnapshotCleanup] Cleanup pass complete:', { count: evicted.length });
}

/**
* Eagerly invalidate a snapshot AND remove its Docker image. Called on PR
* merge: the snapshot was built for a specific work-item state and is no
* longer valid. Without this, `invalidateSnapshot` alone clears the registry
* entry but leaves the image in Docker storage — and the periodic 5-min
* cleanup loop iterates registry entries only, so the orphaned image is
* leaked permanently. That leak is what filled prod disk to 100% on
* 2026-04-29.
*
* Safe to call when no snapshot exists for the pair (no-op).
*/
export async function invalidateAndRemoveSnapshot(
projectId: string,
workItemId: string,
): Promise<void> {
const removed = invalidateSnapshot(projectId, workItemId);
if (!removed) return;
await removeSnapshotImage(removed);
}
18 changes: 14 additions & 4 deletions src/router/snapshot-manager.ts
Original file line number Diff line number Diff line change
Expand Up @@ -155,17 +155,27 @@ export function getSnapshot(

/**
* Invalidate (remove) snapshot metadata for a project+workItem pair.
* Safe to call even if no snapshot exists.
* Returns the removed metadata so the caller can `docker rmi` the underlying
* image. Returns undefined when no entry was registered. Removing the in-memory
* entry without removing the image would orphan the image (the periodic cleanup
* loop iterates registry entries only) — callers MUST act on the returned
* metadata. See snapshot-cleanup.invalidateAndRemoveSnapshot for the canonical
* caller.
*/
export function invalidateSnapshot(projectId: string, workItemId: string): void {
export function invalidateSnapshot(
projectId: string,
workItemId: string,
): SnapshotMetadata | undefined {
const key = snapshotKey(projectId, workItemId);
const hadEntry = snapshots.delete(key);
if (hadEntry) {
const removed = snapshots.get(key);
snapshots.delete(key);
if (removed) {
logger.info('[SnapshotManager] Snapshot invalidated:', {
projectId,
workItemId,
});
}
return removed;
}

/**
Expand Down
17 changes: 12 additions & 5 deletions src/triggers/github/pr-merged.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import { githubClient } from '../../github/client.js';
import { getPMProvider } from '../../pm/context.js';
import { resolveProjectPMConfig } from '../../pm/lifecycle.js';
import { invalidateSnapshot } from '../../router/snapshot-manager.js';
import { invalidateAndRemoveSnapshot } from '../../router/snapshot-cleanup.js';
import type { TriggerContext, TriggerHandler, TriggerResult } from '../../types/index.js';
import { logger } from '../../utils/logging.js';
import { parseRepoFullName } from '../../utils/repo.js';
Expand Down Expand Up @@ -47,10 +47,17 @@ export class PRMergedTrigger implements TriggerHandler {
return null;
}

// Fire-and-forget: invalidate any stale snapshot for this work item now that
// the PR is merged. The snapshot was built for a specific state of the work
// item and is no longer valid after the work is done.
invalidateSnapshot(ctx.project.id, workItemId);
// Fire-and-forget: invalidate the registry entry AND `docker rmi` the
// underlying image. Registry-only invalidation orphans the image (the
// periodic cleanup loop iterates registry entries) — see 2026-04-29
// disk-fill incident.
void invalidateAndRemoveSnapshot(ctx.project.id, workItemId).catch((err: unknown) => {
logger.warn('Failed to invalidate+remove snapshot on PR merge', {
projectId: ctx.project.id,
workItemId,
error: String(err),
});
});

const pmConfig = resolveProjectPMConfig(ctx.project);
const mergedStatus = pmConfig.statuses.merged;
Expand Down
59 changes: 59 additions & 0 deletions tests/unit/router/snapshot-cleanup.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -50,15 +50,19 @@ vi.mock('../../../src/router/config.js', () => ({
},
}));

const mockInvalidateSnapshot = vi.fn();

vi.mock('../../../src/router/snapshot-manager.js', () => ({
evictSnapshots: (...args: unknown[]) => mockEvictSnapshots(...args),
invalidateSnapshot: (...args: unknown[]) => mockInvalidateSnapshot(...args),
}));

// ---------------------------------------------------------------------------
// Imports (after mocks)
// ---------------------------------------------------------------------------

import {
invalidateAndRemoveSnapshot,
runSnapshotCleanup,
startSnapshotCleanup,
stopSnapshotCleanup,
Expand Down Expand Up @@ -236,4 +240,59 @@ describe('snapshot-cleanup', () => {
expect(mockImageRemove).toHaveBeenCalledTimes(3);
});
});

// -------------------------------------------------------------------------
// invalidateAndRemoveSnapshot — eager rmi when a PR merges (no 5-min wait)
// -------------------------------------------------------------------------

describe('invalidateAndRemoveSnapshot', () => {
beforeEach(() => {
mockInvalidateSnapshot.mockReset();
});

it('calls docker.getImage().remove() when invalidate returned metadata', async () => {
// Live regression: prior to this fix, pr-merged.ts called
// `invalidateSnapshot()` (registry-only) and never called
// `docker rmi`. The image was orphaned in the Docker daemon —
// the periodic 5-min cleanup loop only iterates registry entries
// and would never see it. The unregistered image leaked
// permanently, contributing to today's disk-fill incident.
const metadata = makeMetadata({ imageName: 'cascade-snapshot-proj-1-card-abc:latest' });
mockInvalidateSnapshot.mockReturnValue(metadata);

await invalidateAndRemoveSnapshot('proj-1', 'card-abc');

expect(mockInvalidateSnapshot).toHaveBeenCalledWith('proj-1', 'card-abc');
expect(mockDockerGetImage).toHaveBeenCalledWith('cascade-snapshot-proj-1-card-abc:latest');
expect(mockImageRemove).toHaveBeenCalledOnce();
});

it('is a no-op when no snapshot was registered (no docker call)', async () => {
mockInvalidateSnapshot.mockReturnValue(undefined);

await invalidateAndRemoveSnapshot('proj-missing', 'card-missing');

expect(mockInvalidateSnapshot).toHaveBeenCalledWith('proj-missing', 'card-missing');
expect(mockDockerGetImage).not.toHaveBeenCalled();
expect(mockImageRemove).not.toHaveBeenCalled();
});

it('swallows 404 (image already gone) without warning or sentry capture', async () => {
mockInvalidateSnapshot.mockReturnValue(makeMetadata());
mockImageRemove.mockRejectedValueOnce(makeDockerError(404, 'no such image'));

await expect(invalidateAndRemoveSnapshot('proj-1', 'card-1')).resolves.toBeUndefined();
expect(mockLogger.warn).not.toHaveBeenCalled();
expect(mockCaptureException).not.toHaveBeenCalled();
});

it('swallows 409 (image in use) without warning or sentry capture', async () => {
mockInvalidateSnapshot.mockReturnValue(makeMetadata());
mockImageRemove.mockRejectedValueOnce(makeDockerError(409, 'image in use'));

await expect(invalidateAndRemoveSnapshot('proj-1', 'card-1')).resolves.toBeUndefined();
expect(mockLogger.warn).not.toHaveBeenCalled();
expect(mockCaptureException).not.toHaveBeenCalled();
});
});
});
21 changes: 21 additions & 0 deletions tests/unit/router/snapshot-manager.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,27 @@ describe('snapshot-manager', () => {
expect(getSnapshot('proj-1', 'card-2')).toBeDefined();
expect(getSnapshotCount()).toBe(1);
});

// Spec: invalidate-on-PR-merge should also `docker rmi` the image. The
// caller (pr-merged trigger) owns the rmi via the snapshot-cleanup
// helper; for that helper to know which image to remove, invalidate
// must return the metadata of the entry it removed (or undefined
// when nothing was registered).
it('returns the removed metadata when an entry was registered', () => {
registerSnapshot('proj-1', 'card-abc', 'cascade-snapshot-proj-1-card-abc:latest');

const removed = invalidateSnapshot('proj-1', 'card-abc');

expect(removed).toBeDefined();
expect(removed?.imageName).toBe('cascade-snapshot-proj-1-card-abc:latest');
expect(removed?.projectId).toBe('proj-1');
expect(removed?.workItemId).toBe('card-abc');
});

it('returns undefined when no entry was registered', () => {
const removed = invalidateSnapshot('proj-missing', 'card-missing');
expect(removed).toBeUndefined();
});
});

// -------------------------------------------------------------------------
Expand Down
21 changes: 13 additions & 8 deletions tests/unit/triggers/pr-merged.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -52,10 +52,14 @@ vi.mock('../../../src/db/repositories/prWorkItemsRepository.js', () => ({
lookupWorkItemForPR: vi.fn(),
}));

// Mock the snapshot manager so we can verify invalidation calls
const mockInvalidateSnapshot = vi.fn();
vi.mock('../../../src/router/snapshot-manager.js', () => ({
invalidateSnapshot: (...args: unknown[]) => mockInvalidateSnapshot(...args),
// Mock the snapshot-cleanup helper so we can verify invalidation+rmi calls.
// pr-merged calls `invalidateAndRemoveSnapshot` (the helper that BOTH clears
// the in-memory registry AND `docker rmi`s the image), not the registry-only
// `invalidateSnapshot`. The latter alone leaks the docker image — see the
// 2026-04-29 disk-fill incident.
const mockInvalidateAndRemoveSnapshot = vi.fn().mockResolvedValue(undefined);
vi.mock('../../../src/router/snapshot-cleanup.js', () => ({
invalidateAndRemoveSnapshot: (...args: unknown[]) => mockInvalidateAndRemoveSnapshot(...args),
}));

// Register PM integrations in the registry via the canonical bootstrap path
Expand Down Expand Up @@ -91,7 +95,8 @@ describe('PRMergedTrigger', () => {
beforeEach(() => {
vi.mocked(lookupWorkItemForPR).mockResolvedValue('abc123');
vi.mocked(checkTriggerEnabled).mockResolvedValue(true);
mockInvalidateSnapshot.mockClear();
mockInvalidateAndRemoveSnapshot.mockReset();
mockInvalidateAndRemoveSnapshot.mockResolvedValue(undefined);
});

describe('matches', () => {
Expand Down Expand Up @@ -703,7 +708,7 @@ describe('PRMergedTrigger', () => {
await trigger.handle(ctx);

// Snapshot should be invalidated for the project+workItem pair
expect(mockInvalidateSnapshot).toHaveBeenCalledWith(mockProject.id, 'abc123');
expect(mockInvalidateAndRemoveSnapshot).toHaveBeenCalledWith(mockProject.id, 'abc123');
});

it('does not invalidate snapshot when PR is not merged', async () => {
Expand Down Expand Up @@ -737,7 +742,7 @@ describe('PRMergedTrigger', () => {
await trigger.handle(ctx);

// No invalidation when PR is not merged
expect(mockInvalidateSnapshot).not.toHaveBeenCalled();
expect(mockInvalidateAndRemoveSnapshot).not.toHaveBeenCalled();
});

it('does not invalidate snapshot when no work item is linked', async () => {
Expand Down Expand Up @@ -772,7 +777,7 @@ describe('PRMergedTrigger', () => {
await trigger.handle(ctx);

// No invalidation when there's no linked work item
expect(mockInvalidateSnapshot).not.toHaveBeenCalled();
expect(mockInvalidateAndRemoveSnapshot).not.toHaveBeenCalled();
});
});
});
Loading