From 21d23284745d2cb0f8d6830911a29f3286e1d85e Mon Sep 17 00:00:00 2001 From: adibarra <93070681+adibarra@users.noreply.github.com> Date: Fri, 1 May 2026 01:52:09 -0500 Subject: [PATCH] feat(db): support purging specific run attempts via run-overrides --- packages/db/src/apply-overrides.ts | 73 +++++++++++++++++------ packages/db/src/etl/run-overrides.test.ts | 69 ++++++++++++++++++++- packages/db/src/etl/run-overrides.ts | 20 ++++++- packages/db/src/etl/workflow-run.ts | 6 +- packages/db/src/ingest-ci-run.ts | 10 ++-- 5 files changed, 152 insertions(+), 26 deletions(-) diff --git a/packages/db/src/apply-overrides.ts b/packages/db/src/apply-overrides.ts index 028e2f7c..e84078ff 100644 --- a/packages/db/src/apply-overrides.ts +++ b/packages/db/src/apply-overrides.ts @@ -2,6 +2,7 @@ * Enforce all run-overrides.ts entries against the DB: * 1. Patch conclusions for CONCLUSION_OVERRIDES * 2. Purge runs listed in PURGED_RUNS + * 3. Purge specific attempts listed in PURGED_RUN_ATTEMPTS * * Previews changes (read-only), then confirms before writing. * @@ -12,7 +13,7 @@ import { confirm, hasNoSslFlag, hasYesFlag } from './cli-utils.js'; import { type Sql, createAdminSql, refreshLatestBenchmarks } from './etl/db-utils.js'; -import { CONCLUSION_OVERRIDES, PURGED_RUNS } from './etl/run-overrides.js'; +import { CONCLUSION_OVERRIDES, PURGED_RUN_ATTEMPTS, PURGED_RUNS } from './etl/run-overrides.js'; const sql = createAdminSql({ noSsl: hasNoSslFlag(), @@ -75,21 +76,40 @@ interface PurgeTarget { changelogs: number; } -/** Preview a run: print metadata and row counts. Returns null if not in DB. */ -async function previewPurge(githubRunId: number): Promise { - const runs = await sql` - SELECT id, run_attempt, date::text AS date, name, conclusion - FROM workflow_runs - WHERE github_run_id = ${githubRunId} - ORDER BY run_attempt - `; +/** + * Preview a run: print metadata and row counts. Returns null if not in DB. 
+ * If `attempts` is provided, only those `run_attempt` values are targeted; + * otherwise every attempt for the run is included. + */ +async function previewPurge( + githubRunId: number, + attempts?: ReadonlySet<number>, +): Promise<PurgeTarget | null> { + const runs = attempts + ? await sql` + SELECT id, run_attempt, date::text AS date, name, conclusion + FROM workflow_runs + WHERE github_run_id = ${githubRunId} + AND run_attempt = ANY(${[...attempts]}) + ORDER BY run_attempt + ` + : await sql` + SELECT id, run_attempt, date::text AS date, name, conclusion + FROM workflow_runs + WHERE github_run_id = ${githubRunId} + ORDER BY run_attempt + `; if (runs.length === 0) { - console.log(` ${githubRunId} — not in DB, skipping.`); + const suffix = attempts ? ` attempts ${[...attempts].toSorted((a, b) => a - b).join(',')}` : ''; + console.log(` ${githubRunId}${suffix} — not in DB, skipping.`); return null; } const wrIds = runs.map((r) => r.id as number); - console.log(` ${githubRunId}`); + const header = attempts + ? `${githubRunId} (attempts ${runs.map((r) => r.run_attempt).join(',')})` + : `${githubRunId}`; + console.log(` ${header}`); for (const r of runs) { const shortName = r.name.split('\n')[0].slice(0, 80); console.log( @@ -120,8 +140,12 @@ async function previewPurge(githubRunId: number): Promise<PurgeTarget | null> { }; } -/** Delete all data for a run within a transaction. */ -async function purge(githubRunId: number, wrIds: number[]): Promise<void> { +/** + * Delete data for the given workflow_run rows (one or more attempts) in a transaction. + * `wrIds` is the set of `workflow_runs.id` values to remove; sibling attempts of the + * same `github_run_id` that aren't in `wrIds` are left intact. 
+ */ +async function purge(wrIds: number[]): Promise { // postgres TransactionSql Omit drops the call signature — cast to Sql type await sql.begin(async (_tx) => { const tx = _tx as unknown as Sql; @@ -192,8 +216,9 @@ async function purge(githubRunId: number, wrIds: number[]): Promise { `; } - // Parent last - await tx`DELETE FROM workflow_runs WHERE github_run_id = ${githubRunId}`; + // Parent last (target the specific workflow_runs rows so partial purges + // leave sibling attempts of the same github_run_id intact) + await tx`DELETE FROM workflow_runs WHERE id = ANY(${wrIds})`; }); console.log(` deleted.`); @@ -227,6 +252,20 @@ async function main(): Promise { if (result) found.push(result); } } + + const attemptTargets = [...PURGED_RUN_ATTEMPTS.entries()]; + if (attemptTargets.length > 0) { + console.log(`\n Purge attempt targets (${attemptTargets.length}):`); + for (const [id, attempts] of attemptTargets) { + // Skip if the whole run is already covered by PURGED_RUNS + if (PURGED_RUNS.has(id)) { + console.log(` ${id} — already in PURGED_RUNS, skipping per-attempt purge.`); + continue; + } + const result = await previewPurge(id, attempts); + if (result) found.push(result); + } + } if (found.length > 0) hasWork = true; if (!hasWork) { @@ -251,8 +290,8 @@ async function main(): Promise { if (found.length > 0) { console.log('\n Purging runs...'); - for (const { githubRunId, wrIds } of found) { - await purge(githubRunId, wrIds); + for (const { wrIds } of found) { + await purge(wrIds); } } diff --git a/packages/db/src/etl/run-overrides.test.ts b/packages/db/src/etl/run-overrides.test.ts index 9130d063..a0cc9852 100644 --- a/packages/db/src/etl/run-overrides.test.ts +++ b/packages/db/src/etl/run-overrides.test.ts @@ -1,5 +1,10 @@ import { describe, it, expect } from 'vitest'; -import { CONCLUSION_OVERRIDES, PURGED_RUNS } from './run-overrides'; +import { + CONCLUSION_OVERRIDES, + PURGED_RUN_ATTEMPTS, + PURGED_RUNS, + isRunAttemptPurged, +} from './run-overrides'; 
describe('CONCLUSION_OVERRIDES', () => { it('all run IDs are positive integers', () => { @@ -34,3 +39,65 @@ describe('PURGED_RUNS', () => { } }); }); + +describe('PURGED_RUN_ATTEMPTS', () => { + it('all run IDs and attempt numbers are positive integers', () => { + for (const [runId, attempts] of PURGED_RUN_ATTEMPTS) { + expect(runId).toBeGreaterThan(0); + expect(Number.isInteger(runId)).toBe(true); + expect(attempts.size).toBeGreaterThan(0); + for (const attempt of attempts) { + expect(attempt).toBeGreaterThan(0); + expect(Number.isInteger(attempt)).toBe(true); + } + } + }); + + it('does not overlap with PURGED_RUNS (use one or the other)', () => { + for (const runId of PURGED_RUN_ATTEMPTS.keys()) { + expect( + PURGED_RUNS.has(runId), + `run ${runId} appears in both PURGED_RUNS and PURGED_RUN_ATTEMPTS`, + ).toBe(false); + } + }); + + it('does not overlap with CONCLUSION_OVERRIDES', () => { + for (const runId of PURGED_RUN_ATTEMPTS.keys()) { + expect( + CONCLUSION_OVERRIDES.has(runId), + `run ${runId} is in both PURGED_RUN_ATTEMPTS and CONCLUSION_OVERRIDES`, + ).toBe(false); + } + }); +}); + +describe('isRunAttemptPurged', () => { + it('returns true for runs in PURGED_RUNS regardless of attempt', () => { + const [first] = PURGED_RUNS; + if (first === undefined) return; + expect(isRunAttemptPurged(first)).toBe(true); + expect(isRunAttemptPurged(first, 1)).toBe(true); + expect(isRunAttemptPurged(first, 99)).toBe(true); + }); + + it('returns true only for the specific attempts listed in PURGED_RUN_ATTEMPTS', () => { + for (const [runId, attempts] of PURGED_RUN_ATTEMPTS) { + for (const attempt of attempts) { + expect(isRunAttemptPurged(runId, attempt)).toBe(true); + } + // An attempt not in the set should not be purged (assuming the run isn't in PURGED_RUNS) + const unlistedAttempt = Math.max(...attempts) + 1; + if (!attempts.has(unlistedAttempt)) { + expect(isRunAttemptPurged(runId, unlistedAttempt)).toBe(false); + } + // Without an attempt, only whole-run purges count 
→ false here + expect(isRunAttemptPurged(runId)).toBe(false); + } + }); + + it('returns false for runs that are not purged', () => { + expect(isRunAttemptPurged(1, 1)).toBe(false); + expect(isRunAttemptPurged(1)).toBe(false); + }); +}); diff --git a/packages/db/src/etl/run-overrides.ts b/packages/db/src/etl/run-overrides.ts index 3b0e884c..7c6b7ee6 100644 --- a/packages/db/src/etl/run-overrides.ts +++ b/packages/db/src/etl/run-overrides.ts @@ -1,7 +1,7 @@ /** * Per-run overrides and special cases for the ingest pipeline. * - * Both are applied at ingest time. Run `pnpm db:apply-overrides` to patch existing DB rows. + * All are applied at ingest time. Run `pnpm db:apply-overrides` to patch existing DB rows. * * CONCLUSION_OVERRIDES — force the conclusion for a run (e.g. 'success' when * the benchmark ran fine but CI failed on a non-benchmark step). @@ -9,6 +9,10 @@ * PURGED_RUNS — runs to skip on ingest and delete from the DB, * e.g. typically due to experimental runs or features which generate lots of broken data. * + * PURGED_RUN_ATTEMPTS — purge only specific attempts of a run, leaving the others intact. + * Use this when a single attempt produced bad data but a later attempt is expected to succeed + * (or has already succeeded), so we can't nuke the entire run. + * * Note: GitHub deletes old workflow runs over time so these overrides may not be applicable forever, * but we should keep them around for historical reference. 
You can find these on github (if available) by filling * in the run id into the following link: https://github.com/SemiAnalysisAI/InferenceX/actions/runs/{run_id_here} @@ -39,3 +43,17 @@ export const PURGED_RUNS: ReadonlySet<number> = new Set([ 24959542295, // 2026-04-25 | Reason: MTP without chat template leads to supernatural AR 24960716250, // 2026-04-25 | Reason: incorrect usage of run sweep and sweep failed, fixed in subsequent PR ]); + +export const PURGED_RUN_ATTEMPTS: ReadonlyMap<number, ReadonlySet<number>> = new Map([ + [25199291771, new Set([1])], // 2026-05-01 | dsv4 GB200 dynamo-vllm MTP2 | Reason: only 2 of 6 conc=1 points uploaded. re-run pending +]); + +/** + * True when the (run, attempt) pair should be skipped on ingest. Pass `runAttempt` + * to honor PURGED_RUN_ATTEMPTS; omit it to check whole-run purges only. + */ +export function isRunAttemptPurged(githubRunId: number, runAttempt?: number): boolean { + if (PURGED_RUNS.has(githubRunId)) return true; + if (runAttempt === undefined) return false; + return PURGED_RUN_ATTEMPTS.get(githubRunId)?.has(runAttempt) ?? false; +} diff --git a/packages/db/src/etl/workflow-run.ts b/packages/db/src/etl/workflow-run.ts index a6bd9fd2..4097a3c5 100644 --- a/packages/db/src/etl/workflow-run.ts +++ b/packages/db/src/etl/workflow-run.ts @@ -8,7 +8,7 @@ import type postgres from 'postgres'; import { GITHUB_API_BASE, GITHUB_REPOS } from '@semianalysisai/inferencex-constants'; -import { CONCLUSION_OVERRIDES, PURGED_RUNS } from './run-overrides.js'; +import { CONCLUSION_OVERRIDES, isRunAttemptPurged } from './run-overrides.js'; type Sql = ReturnType<typeof postgres>; @@ -148,9 +148,9 @@ export function createWorkflowRunServices(sql: Sql, githubToken?: string) { runStartedAt?: string | null; ghInfo?: GithubRunInfo | null; }): Promise<number | null> { - if (PURGED_RUNS.has(params.githubRunId)) return null; - const attempt = params.runAttempt ?? params.ghInfo?.runAttempt ?? 
0; + if (isRunAttemptPurged(params.githubRunId, attempt)) return null; + const cacheKey = `${params.githubRunId}:${attempt}`; if (workflowRunCache.has(cacheKey)) return workflowRunCache.get(cacheKey)!; diff --git a/packages/db/src/ingest-ci-run.ts b/packages/db/src/ingest-ci-run.ts index d122b136..69923770 100644 --- a/packages/db/src/ingest-ci-run.ts +++ b/packages/db/src/ingest-ci-run.ts @@ -28,7 +28,7 @@ import { GPU_KEYS } from '@semianalysisai/inferencex-constants'; import { hasNoSslFlag } from './cli-utils'; import { createAdminSql, refreshLatestBenchmarks } from './etl/db-utils'; -import { PURGED_RUNS } from './etl/run-overrides'; +import { isRunAttemptPurged } from './etl/run-overrides'; import { createSkipTracker } from './etl/skip-tracker'; import { createConfigCache } from './etl/config-cache'; import { createWorkflowRunServices } from './etl/workflow-run'; @@ -158,8 +158,8 @@ if (!process.env.DATABASE_WRITE_URL || !process.env.GITHUB_TOKEN) { } const runIdNum = parseInt(runIdStr, 10); -if (PURGED_RUNS.has(runIdNum)) { - console.log(` Run ${runIdStr} is in PURGED_RUNS — skipping.`); +if (isRunAttemptPurged(runIdNum, runAttemptNum)) { + console.log(` Run ${runIdStr} attempt ${runAttemptNum} is purged via run-overrides — skipping.`); process.exit(0); } @@ -243,7 +243,9 @@ async function main(): Promise { ghInfo, }); if (workflowRunId === null) { - console.log(` Run ${runId} is in PURGED_RUNS — skipping ingest.`); + console.log( + ` Run ${runId} attempt ${runAttemptNum} is purged via run-overrides — skipping ingest.`, + ); return; } console.log(` Workflow run DB id: ${workflowRunId}`);