Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 74 additions & 0 deletions src/router/snapshot-cleanup.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
/**
* Periodic snapshot eviction for CASCADE worker snapshots.
*
* Runs alongside the existing orphan cleanup loop (orphan-cleanup.ts) and
* uses the same start/stop lifecycle pattern. On each tick it calls
* evictSnapshots() to enforce the per-project TTL and global max-count /
* max-size budget limits.
*
* This module owns only the timer — no Docker API usage. The actual eviction
* logic lives in snapshot-manager.ts.
*/

import { captureException } from '../sentry.js';
import { logger } from '../utils/logging.js';
import { routerConfig } from './config.js';
import { evictSnapshots } from './snapshot-manager.js';

// How often the eviction sweep runs. Used both for the setInterval period
// and (indirectly) for the startup log message.
const SNAPSHOT_CLEANUP_INTERVAL_MS = 5 * 60 * 1000; // 5 minutes

/** Handle for the periodic snapshot cleanup timer; null when the loop is not running */
let snapshotCleanupTimer: NodeJS.Timeout | null = null;

/**
 * Start periodic snapshot eviction.
 * Runs every SNAPSHOT_CLEANUP_INTERVAL_MS and enforces the per-project TTL
 * plus global max-count / max-size limits via runSnapshotCleanup().
 * No-op (with a warning) if already started.
 */
export function startSnapshotCleanup(): void {
  if (snapshotCleanupTimer) {
    logger.warn('[SnapshotCleanup] Snapshot cleanup already started');
    return;
  }

  snapshotCleanupTimer = setInterval(() => {
    // runSnapshotCleanup is async: route any rejection to logging + Sentry so
    // an unhandled rejection never escapes the timer callback.
    runSnapshotCleanup().catch((err) => {
      logger.error('[SnapshotCleanup] Error during snapshot cleanup scan:', err);
      captureException(err, {
        tags: { source: 'snapshot_cleanup_scan' },
        level: 'error',
      });
    });
  }, SNAPSHOT_CLEANUP_INTERVAL_MS);

  // Derive the interval from the constant so this message can never drift
  // from the actual schedule (it was previously hard-coded as "5 minutes").
  const minutes = SNAPSHOT_CLEANUP_INTERVAL_MS / 60_000;
  logger.info(`[SnapshotCleanup] Started snapshot cleanup scan (every ${minutes} minutes)`);
}

/**
 * Stop periodic snapshot eviction.
 * Clears the scan timer. No-op if not started.
 */
export function stopSnapshotCleanup(): void {
  if (!snapshotCleanupTimer) {
    return;
  }
  clearInterval(snapshotCleanupTimer);
  snapshotCleanupTimer = null;
  logger.info('[SnapshotCleanup] Stopped snapshot cleanup scan');
}

/**
 * Run a single snapshot eviction sweep using the global config limits.
 * Exposed for testing and for manual invocation.
 * @internal Exported for testing
 */
export async function runSnapshotCleanup(): Promise<void> {
  const { snapshotDefaultTtlMs, snapshotMaxCount, snapshotMaxSizeBytes } = routerConfig;

  const removed = evictSnapshots(snapshotDefaultTtlMs, snapshotMaxCount, snapshotMaxSizeBytes);

  if (removed > 0) {
    logger.info('[SnapshotCleanup] Snapshot cleanup scan removed entries:', { evicted: removed });
  }
}
104 changes: 104 additions & 0 deletions src/router/snapshot-manager.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,14 @@
*
* This module is pure state management — no Docker API usage.
* Docker commit operations are triggered from container-manager.ts.
*
* Eviction strategy:
* - TTL eviction: snapshots older than ttlMs are removed on access (eager)
* or during periodic cleanup scans.
* - Max-count eviction: when the registry exceeds snapshotMaxCount, the oldest
* entries are removed first (LRU by createdAt).
* - Max-size eviction: when the total estimated image size exceeds
* snapshotMaxSizeBytes, the oldest entries are removed first.
*/

import { logger } from '../utils/logging.js';
Expand All @@ -22,6 +30,8 @@ export interface SnapshotMetadata {
workItemId: string;
/** Wall-clock timestamp when the snapshot was created */
createdAt: Date;
/** Estimated size of the snapshot image in bytes (optional, used for budget eviction) */
imageSizeBytes?: number;
}

/** In-memory snapshot registry keyed by `${projectId}:${workItemId}` */
Expand All @@ -39,13 +49,15 @@ export function registerSnapshot(
projectId: string,
workItemId: string,
imageName: string,
imageSizeBytes?: number,
): SnapshotMetadata {
const key = snapshotKey(projectId, workItemId);
const metadata: SnapshotMetadata = {
imageName,
projectId,
workItemId,
createdAt: new Date(),
imageSizeBytes,
};
snapshots.set(key, metadata);
logger.info('[SnapshotManager] Snapshot registered:', {
Expand Down Expand Up @@ -113,6 +125,98 @@ export function getSnapshotCount(): number {
return snapshots.size;
}

/**
 * Evict expired and over-budget snapshots from the in-memory registry.
 *
 * Eviction order:
 * 1. TTL: remove all entries older than ttlMs.
 * 2. Max-count: if still over budget, remove oldest entries until at or below
 *    maxCount.
 * 3. Max-size: if still over budget, remove oldest entries until the estimated
 *    total size is at or below maxSizeBytes.
 *
 * This function operates only on the in-memory metadata registry. It does NOT
 * remove Docker images — callers are responsible for any Docker cleanup.
 *
 * @param ttlMs maximum age before an entry is considered expired
 * @param maxCount global cap on the number of registered snapshots
 * @param maxSizeBytes global cap on the summed estimated image sizes
 * @returns the number of entries removed
 */
export function evictSnapshots(
  ttlMs: number = routerConfig.snapshotDefaultTtlMs,
  maxCount: number = routerConfig.snapshotMaxCount,
  maxSizeBytes: number = routerConfig.snapshotMaxSizeBytes,
): number {
  let evicted = 0;
  const now = Date.now();

  // Phase 1: TTL eviction — remove all expired entries. Deleting during Map
  // iteration is well-defined in JS, so no copy of the entries is needed.
  for (const [key, metadata] of snapshots) {
    const ageMs = now - metadata.createdAt.getTime();
    if (ageMs > ttlMs) {
      snapshots.delete(key);
      evicted++;
      logger.info('[SnapshotManager] Evicted expired snapshot:', {
        projectId: metadata.projectId,
        workItemId: metadata.workItemId,
        ageMs,
        ttlMs,
      });
    }
  }

  // Oldest-first ordering of the survivors (LRU by createdAt), computed once
  // and shared by phases 2 and 3 instead of sorting twice.
  const oldestFirst = Array.from(snapshots.entries()).sort(
    ([, a], [, b]) => a.createdAt.getTime() - b.createdAt.getTime(),
  );

  // Phase 2: Max-count eviction — remove oldest entries if over budget.
  if (snapshots.size > maxCount) {
    const toRemove = snapshots.size - maxCount;
    for (let i = 0; i < toRemove; i++) {
      const [key, metadata] = oldestFirst[i];
      snapshots.delete(key);
      evicted++;
      logger.info('[SnapshotManager] Evicted snapshot (over max-count):', {
        projectId: metadata.projectId,
        workItemId: metadata.workItemId,
        // Registry size after this deletion. (The previous expression,
        // `snapshots.size + (toRemove - i)`, over-counted on every iteration
        // except the last.)
        snapshotCount: snapshots.size,
        maxCount,
      });
    }
  }

  // Phase 3: Max-size eviction — remove oldest entries if over the total
  // estimated size budget. Entries with unknown size count as 0 bytes.
  let runningSize = Array.from(snapshots.values()).reduce(
    (sum, m) => sum + (m.imageSizeBytes ?? 0),
    0,
  );
  if (runningSize > maxSizeBytes) {
    for (const [key, metadata] of oldestFirst) {
      if (runningSize <= maxSizeBytes) break;
      // oldestFirst may still contain entries deleted in phase 2; skip them.
      if (!snapshots.has(key)) continue;
      snapshots.delete(key);
      evicted++;
      runningSize -= metadata.imageSizeBytes ?? 0;
      logger.info('[SnapshotManager] Evicted snapshot (over max-size):', {
        projectId: metadata.projectId,
        workItemId: metadata.workItemId,
        imageSizeBytes: metadata.imageSizeBytes,
        runningSize,
        maxSizeBytes,
      });
    }
  }

  if (evicted > 0) {
    logger.info('[SnapshotManager] Eviction sweep complete:', {
      evicted,
      remaining: snapshots.size,
    });
  }

  return evicted;
}

/**
* Clear all snapshot metadata.
* Intended for use in tests and clean-shutdown scenarios.
Expand Down
7 changes: 6 additions & 1 deletion src/router/worker-manager.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import {
stopOrphanCleanup,
} from './container-manager.js';
import type { CascadeJob } from './queue.js';
import { startSnapshotCleanup, stopSnapshotCleanup } from './snapshot-cleanup.js';

// Re-export container-manager public API so existing callers are unaffected.
export { getActiveWorkerCount, getActiveWorkers, startOrphanCleanup, stopOrphanCleanup };
Expand Down Expand Up @@ -78,13 +79,17 @@ export function startWorkerProcessor(): void {
// Start periodic orphan cleanup scan
startOrphanCleanup();

// Start periodic snapshot eviction alongside orphan cleanup
startSnapshotCleanup();

logger.info('[WorkerManager] Started with max', routerConfig.maxWorkers, 'concurrent workers');
}

// Graceful shutdown — detach from workers, let them finish independently
export async function stopWorkerProcessor(): Promise<void> {
// Stop orphan cleanup first
// Stop orphan cleanup and snapshot cleanup first
stopOrphanCleanup();
stopSnapshotCleanup();

if (dashboardWorker) {
await dashboardWorker.close();
Expand Down
6 changes: 6 additions & 0 deletions src/triggers/github/pr-merged.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import { githubClient } from '../../github/client.js';
import { getPMProvider } from '../../pm/context.js';
import { resolveProjectPMConfig } from '../../pm/lifecycle.js';
import { invalidateSnapshot } from '../../router/snapshot-manager.js';
import type { TriggerContext, TriggerHandler, TriggerResult } from '../../types/index.js';
import { logger } from '../../utils/logging.js';
import { parseRepoFullName } from '../../utils/repo.js';
Expand Down Expand Up @@ -46,6 +47,11 @@ export class PRMergedTrigger implements TriggerHandler {
return null;
}

// Fire-and-forget: invalidate any stale snapshot for this work item now that
// the PR is merged. The snapshot was built for a specific state of the work
// item and is no longer valid after the work is done.
invalidateSnapshot(ctx.project.id, workItemId);

const pmConfig = resolveProjectPMConfig(ctx.project);
const mergedStatus = pmConfig.statuses.merged;

Expand Down
11 changes: 11 additions & 0 deletions src/triggers/trello/status-changed.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import { getTrelloConfig } from '../../pm/config.js';
import { invalidateSnapshot } from '../../router/snapshot-manager.js';
import { logger } from '../../utils/logging.js';
import { checkTriggerEnabled } from '../shared/trigger-check.js';
import type { TriggerContext, TriggerHandler, TriggerResult } from '../types.js';
Expand All @@ -13,6 +14,8 @@ interface StatusChangedConfig {
description: string;
listKey: 'splitting' | 'planning' | 'todo' | 'backlog' | 'merged';
agentType: 'splitting' | 'planning' | 'implementation' | 'backlog-manager';
/** When true, invalidate any snapshot for the card when it reaches this status */
invalidateSnapshotOnMove?: boolean;
}

function createStatusChangedTrigger(config: StatusChangedConfig): TriggerHandler {
Expand Down Expand Up @@ -68,6 +71,13 @@ function createStatusChangedTrigger(config: StatusChangedConfig): TriggerHandler
const workItemUrl = cardShortLink ? `https://trello.com/c/${cardShortLink}` : undefined;
const workItemTitle = cardName ?? undefined;

// Fire-and-forget: invalidate any stale snapshot for this work item when
// the card reaches a terminal status (e.g. merged). The snapshot was built
// for an earlier state and is no longer useful.
if (config.invalidateSnapshotOnMove) {
invalidateSnapshot(ctx.project.id, cardId);
}

return {
agentType: config.agentType,
agentInput: {
Expand Down Expand Up @@ -122,4 +132,5 @@ export const TrelloStatusChangedMergedTrigger = createStatusChangedTrigger({
'Re-triggers backlog-manager when any card is moved to MERGED, so manually resolved dependencies unblock the backlog',
listKey: 'merged',
agentType: 'backlog-manager',
invalidateSnapshotOnMove: true,
});
Loading
Loading