diff --git a/packages/app/cypress/support/mock-data.ts b/packages/app/cypress/support/mock-data.ts index e6720c0b..467cbd59 100644 --- a/packages/app/cypress/support/mock-data.ts +++ b/packages/app/cypress/support/mock-data.ts @@ -436,11 +436,14 @@ export function createMockUnofficialRunContext( return { isUnofficialRun: false, unofficialRunInfo: null, + unofficialRunInfos: [], + runIndexByUrl: {}, unofficialChartData: null, unofficialEvalRows: null, loading: false, error: null, clearUnofficialRun: namedStub('clearUnofficialRun'), + dismissRun: namedStub('dismissRun'), availableModelsAndSequences: [], getOverlayData: cy .stub() diff --git a/packages/app/src/app/api/unofficial-run/route.test.ts b/packages/app/src/app/api/unofficial-run/route.test.ts index 9077edbf..be87e016 100644 --- a/packages/app/src/app/api/unofficial-run/route.test.ts +++ b/packages/app/src/app/api/unofficial-run/route.test.ts @@ -206,7 +206,7 @@ describe('normalizeArtifactRows', () => { describe('normalizeEvalArtifactRows', () => { it('converts aggregate eval rows to EvalRow shape with synthetic config ids', () => { - const rows = normalizeEvalArtifactRows( + const { rows, maxConfigId } = normalizeEvalArtifactRows( [rawEvalRow({ task: 'gsm8k', conc: 16 }), rawEvalRow({ task: 'mmlu', conc: 32 })], '2026-03-01', '2026-03-01T12:34:56Z', @@ -229,10 +229,26 @@ describe('normalizeEvalArtifactRows', () => { }); expect(rows[1].config_id).toBe(1); expect(rows[1].metrics.em_strict).toBe(0.91); + expect(maxConfigId).toBe(1); + }); + + it('offsets config ids when configIdOffset is provided', () => { + const { rows, maxConfigId } = normalizeEvalArtifactRows( + [rawEvalRow({ task: 'gsm8k', conc: 16 }), rawEvalRow({ task: 'mmlu', hw: 'h200-nv' })], + '2026-03-01', + '2026-03-01T12:34:56Z', + 'https://github.com/SemiAnalysisAI/InferenceX/actions/runs/123', + 10, + ); + + expect(rows).toHaveLength(2); + // Two distinct configs (different hw) → local ids 1 and 2, plus offset = 11 and 12 + expect(rows.map((r) => 
r.config_id).toSorted()).toEqual([11, 12]); + expect(maxConfigId).toBe(12); }); it('skips eval rows with unmapped hardware/model/task', () => { - const rows = normalizeEvalArtifactRows( + const { rows } = normalizeEvalArtifactRows( [ rawEvalRow({ hw: 'unknown-gpu' }), rawEvalRow({ model_prefix: 'unknown', model: 'unknown/model' }), @@ -278,6 +294,13 @@ describe('GET /api/unofficial-run', () => { expect(res.status).toBe(400); }); + it('returns 400 when comma-separated list contains a non-numeric id', async () => { + const res = await GET(makeRequest('runId=123,abc,456')); + expect(res.status).toBe(400); + const body = await res.json(); + expect(body.error).toContain('comma-separated'); + }); + it('returns 500 when GITHUB_TOKEN is not set', async () => { delete process.env.GITHUB_TOKEN; const mod = await import('./route'); @@ -375,10 +398,12 @@ describe('GET /api/unofficial-run', () => { const res = await GET(makeRequest('runId=123')); expect(res.status).toBe(200); const body = await res.json(); - expect(body.runInfo.id).toBe(123); - expect(body.runInfo.isNonMainBranch).toBe(false); + expect(body.runInfos).toHaveLength(1); + expect(body.runInfos[0].id).toBe(123); + expect(body.runInfos[0].isNonMainBranch).toBe(false); expect(body.benchmarks).toHaveLength(1); expect(body.benchmarks[0].hardware).toBe('h200'); + expect(body.benchmarks[0].run_url).toBe('http://github.com/run/123'); expect(body.evaluations).toEqual([]); }); @@ -466,7 +491,154 @@ describe('GET /api/unofficial-run', () => { const res = await GET(makeRequest('runId=456')); expect(res.status).toBe(200); const body = await res.json(); - expect(body.runInfo.isNonMainBranch).toBe(true); + expect(body.runInfos).toHaveLength(1); + expect(body.runInfos[0].isNonMainBranch).toBe(true); expect(body.benchmarks).toHaveLength(0); }); + + it('merges data from multiple comma-separated runIds', async () => { + // Run 1 metadata + mockFetch.mockResolvedValueOnce({ + ok: true, + json: () => + Promise.resolve({ + id: 111, + 
name: 'run-1', + head_branch: 'feature/a', + head_sha: 'aaa', + created_at: '2026-01-01T00:00:00Z', + html_url: 'http://github.com/run/111', + conclusion: 'success', + status: 'completed', + }), + }); + // Run 1 artifacts + mockFetch.mockResolvedValueOnce({ + ok: true, + json: () => + Promise.resolve({ + artifacts: [{ name: 'results_bmk', id: 10, archive_download_url: 'http://dl-1' }], + }), + }); + // Run 1 download + mockFetch.mockResolvedValueOnce({ + ok: true, + arrayBuffer: () => Promise.resolve(new ArrayBuffer(8)), + }); + mockGetEntries.mockReturnValueOnce([ + { entryName: 'r1.json', getData: () => Buffer.from(JSON.stringify([rawRow()])) }, + ]); + + // Run 2 metadata + mockFetch.mockResolvedValueOnce({ + ok: true, + json: () => + Promise.resolve({ + id: 222, + name: 'run-2', + head_branch: 'feature/b', + head_sha: 'bbb', + created_at: '2026-01-02T00:00:00Z', + html_url: 'http://github.com/run/222', + conclusion: 'success', + status: 'completed', + }), + }); + // Run 2 artifacts + mockFetch.mockResolvedValueOnce({ + ok: true, + json: () => + Promise.resolve({ + artifacts: [{ name: 'results_bmk', id: 20, archive_download_url: 'http://dl-2' }], + }), + }); + // Run 2 download + mockFetch.mockResolvedValueOnce({ + ok: true, + arrayBuffer: () => Promise.resolve(new ArrayBuffer(8)), + }); + mockGetEntries.mockReturnValueOnce([ + { + entryName: 'r2.json', + getData: () => Buffer.from(JSON.stringify([rawRow({ hw: 'mi355x-amds' })])), + }, + ]); + + const res = await GET(makeRequest('runId=111,222')); + expect(res.status).toBe(200); + const body = await res.json(); + expect(body.runInfos).toHaveLength(2); + expect(body.runInfos.map((r: { id: number }) => r.id)).toEqual([111, 222]); + expect(body.benchmarks).toHaveLength(2); + // Each benchmark row is tagged with its originating run_url + expect(body.benchmarks[0].run_url).toBe('http://github.com/run/111'); + expect(body.benchmarks[1].run_url).toBe('http://github.com/run/222'); + }); + + it('dedupes repeated runIds 
in the comma-separated list', async () => { + // Only one set of fetches expected since 123 is deduped + mockFetch.mockResolvedValueOnce({ + ok: true, + json: () => + Promise.resolve({ + id: 123, + head_branch: 'main', + html_url: 'http://github.com/run/123', + created_at: '2026-01-01T00:00:00Z', + }), + }); + mockFetch.mockResolvedValueOnce({ + ok: true, + json: () => + Promise.resolve({ + artifacts: [{ name: 'results_bmk', id: 10, archive_download_url: 'http://dl' }], + }), + }); + mockFetch.mockResolvedValueOnce({ + ok: true, + arrayBuffer: () => Promise.resolve(new ArrayBuffer(8)), + }); + mockGetEntries.mockReturnValueOnce([]); + + const res = await GET(makeRequest('runId=123,123')); + expect(res.status).toBe(200); + const body = await res.json(); + expect(body.runInfos).toHaveLength(1); + // Only three fetches were made (run, artifacts, download) — not six + expect(mockFetch).toHaveBeenCalledTimes(3); + }); + + it('fails with the upstream status when any runId in the list errors', async () => { + // First run succeeds + mockFetch.mockResolvedValueOnce({ + ok: true, + json: () => + Promise.resolve({ + id: 111, + head_branch: 'main', + html_url: 'http://github.com/run/111', + created_at: '2026-01-01T00:00:00Z', + }), + }); + mockFetch.mockResolvedValueOnce({ + ok: true, + json: () => + Promise.resolve({ + artifacts: [{ name: 'results_bmk', id: 10, archive_download_url: 'http://dl' }], + }), + }); + mockFetch.mockResolvedValueOnce({ + ok: true, + arrayBuffer: () => Promise.resolve(new ArrayBuffer(8)), + }); + mockGetEntries.mockReturnValueOnce([]); + + // Second run 404s on metadata fetch + mockFetch.mockResolvedValueOnce({ ok: false, status: 404, statusText: 'Not Found' }); + + const res = await GET(makeRequest('runId=111,999')); + expect(res.status).toBe(404); + const body = await res.json(); + expect(body.error).toContain('999'); + }); }); diff --git a/packages/app/src/app/api/unofficial-run/route.ts b/packages/app/src/app/api/unofficial-run/route.ts index 
79ac0665..4e5b5265 100644 --- a/packages/app/src/app/api/unofficial-run/route.ts +++ b/packages/app/src/app/api/unofficial-run/route.ts @@ -24,6 +24,7 @@ import { export function normalizeArtifactRows( rawRows: Record[], date: string, + runUrl: string | null = null, ): BenchmarkRow[] { const tracker = createSkipTracker(); const results: BenchmarkRow[] = []; @@ -55,7 +56,7 @@ export function normalizeArtifactRows( image: params.image, metrics: params.metrics, date, - run_url: null, + run_url: runUrl, }); } return results; @@ -82,32 +83,40 @@ function evalConfigKey(config: EvalParams['config']): string { ].join('|'); } -/** Normalize aggregate eval rows into the EvalRow shape the frontend expects. */ +/** + * Normalize aggregate eval rows into the EvalRow shape the frontend expects. + * + * When merging rows from multiple runs, pass `configIdOffset` so synthetic config + * ids from this batch don't collide with ids already emitted by earlier batches. + * Returns the rows and the maximum config id assigned, so the caller can advance + * the offset for the next batch. 
+ */ export function normalizeEvalArtifactRows( rawRows: Record[], date: string, timestamp: string, runUrl: string, -): EvalRow[] { + configIdOffset = 0, +): { rows: EvalRow[]; maxConfigId: number } { const tracker = createSkipTracker(); const configIds = new Map(); - let nextConfigId = 1; - const results: EvalRow[] = []; + let nextLocalId = 1; + const rows: EvalRow[] = []; for (const raw of rawRows) { const params = mapAggEvalRow(raw as Record, tracker); if (!params) continue; const key = evalConfigKey(params.config); - let configId = configIds.get(key); - if (!configId) { - configId = nextConfigId; - configIds.set(key, configId); - nextConfigId += 1; + let localId = configIds.get(key); + if (!localId) { + localId = nextLocalId; + configIds.set(key, localId); + nextLocalId += 1; } - results.push({ - config_id: configId, + rows.push({ + config_id: configIdOffset + localId, hardware: params.config.hardware, framework: params.config.framework, model: params.config.model, @@ -134,7 +143,7 @@ export function normalizeEvalArtifactRows( }); } - return results; + return { rows, maxConfigId: configIdOffset + (nextLocalId - 1) }; } /** Extract all valid JSON files from a ZIP buffer; malformed JSON entries are skipped. */ @@ -161,10 +170,111 @@ async function downloadArtifactRows(archiveUrl: string, githubToken: string) { return { rows, errorResponse: null }; } +/** Parse the runId query param into a list of unique numeric ids. */ +function parseRunIds(raw: string | null): { ids: string[]; error: string | null } { + if (!raw) return { ids: [], error: 'runId must be provided' }; + const ids = [ + ...new Set( + raw + .split(',') + .map((s) => s.trim()) + .filter(Boolean), + ), + ]; + if (ids.length === 0 || !ids.every((id) => /^\d+$/.test(id))) { + return { ids: [], error: 'runId must be a comma-separated list of numeric values' }; + } + return { ids, error: null }; +} + +/** Fetch, download, and normalize data for a single run. Errors bubble as NextResponse. 
*/ +async function processSingleRun( + runId: string, + githubToken: string, + evalConfigIdOffset: number, +): Promise< + | { errorResponse: NextResponse } + | { + errorResponse: null; + runInfo: ReturnType & { isNonMainBranch: boolean }; + benchmarks: BenchmarkRow[]; + evaluations: EvalRow[]; + nextEvalConfigIdOffset: number; + } +> { + const runResp = await fetchGithubWorkflowRun(runId, githubToken); + if (!runResp.ok) { + return { + errorResponse: NextResponse.json( + { error: `GitHub API error for runId ${runId}: ${runResp.statusText}` }, + { status: runResp.status }, + ), + }; + } + const run = (await runResp.json()) as GithubWorkflowRun; + + const artifacts = await fetchGithubRunArtifacts(runId, githubToken); + const bmkArtifact = artifacts + .filter((a) => a.name === 'results_bmk') + .toSorted((a, b) => b.id - a.id)[0]; + const evalArtifact = artifacts + .filter((a) => a.name === 'eval_results_all') + .toSorted((a, b) => b.id - a.id)[0]; + + if (!bmkArtifact && !evalArtifact) { + return { + errorResponse: NextResponse.json( + { + error: `No results_bmk or eval_results_all artifact found for runId ${runId}`, + }, + { status: 404 }, + ), + }; + } + + const date = getRunDate(run); + const runUrl = run.html_url ?? ''; + const timestamp = run.created_at ?? 
`${date}T00:00:00Z`; + let benchmarks: BenchmarkRow[] = []; + let evaluations: EvalRow[] = []; + let nextEvalConfigIdOffset = evalConfigIdOffset; + + if (bmkArtifact) { + const { rows, errorResponse } = await downloadArtifactRows( + bmkArtifact.archive_download_url, + githubToken, + ); + if (errorResponse) return { errorResponse }; + benchmarks = normalizeArtifactRows(rows, date, runUrl || null); + } + + if (evalArtifact) { + const { rows, errorResponse } = await downloadArtifactRows( + evalArtifact.archive_download_url, + githubToken, + ); + if (errorResponse) return { errorResponse }; + const normalized = normalizeEvalArtifactRows(rows, date, timestamp, runUrl, evalConfigIdOffset); + evaluations = normalized.rows; + nextEvalConfigIdOffset = normalized.maxConfigId; + } + + return { + errorResponse: null, + runInfo: { + ...normalizeGithubRunInfo(run), + isNonMainBranch: run.head_branch !== 'main', + }, + benchmarks, + evaluations, + nextEvalConfigIdOffset, + }; +} + export async function GET(request: NextRequest) { - const runId = request.nextUrl.searchParams.get('runId'); - if (!runId || !/^\d+$/.test(runId)) { - return NextResponse.json({ error: 'runId must be a numeric value' }, { status: 400 }); + const { ids: runIds, error: runIdError } = parseRunIds(request.nextUrl.searchParams.get('runId')); + if (runIdError) { + return NextResponse.json({ error: runIdError }, { status: 400 }); } const githubToken = getGithubToken(); @@ -173,63 +283,25 @@ export async function GET(request: NextRequest) { } try { - // Fetch workflow run metadata - const runResp = await fetchGithubWorkflowRun(runId, githubToken); - if (!runResp.ok) { - return NextResponse.json( - { error: `GitHub API: ${runResp.statusText}` }, - { status: runResp.status }, - ); - } - const run = (await runResp.json()) as GithubWorkflowRun; - - // Fetch artifacts, find latest benchmark/eval aggregates - const artifacts = await fetchGithubRunArtifacts(runId, githubToken); - - const bmkArtifact = artifacts - 
.filter((a) => a.name === 'results_bmk') - .toSorted((a, b) => b.id - a.id)[0]; + const runInfos: (ReturnType & { + isNonMainBranch: boolean; + })[] = []; + const benchmarks: BenchmarkRow[] = []; + const evaluations: EvalRow[] = []; + let evalConfigIdOffset = 0; - const evalArtifact = artifacts - .filter((a) => a.name === 'eval_results_all') - .toSorted((a, b) => b.id - a.id)[0]; - - if (!bmkArtifact && !evalArtifact) { - return NextResponse.json( - { error: 'No results_bmk or eval_results_all artifact found' }, - { status: 404 }, - ); - } - - const date = getRunDate(run); - const runUrl = run.html_url ?? ''; - const timestamp = run.created_at ?? `${date}T00:00:00Z`; - let benchmarks: BenchmarkRow[] = []; - let evaluations: EvalRow[] = []; - - if (bmkArtifact) { - const { rows, errorResponse } = await downloadArtifactRows( - bmkArtifact.archive_download_url, - githubToken, - ); - if (errorResponse) return errorResponse; - benchmarks = normalizeArtifactRows(rows, date); - } + for (const runId of runIds) { + const result = await processSingleRun(runId, githubToken, evalConfigIdOffset); + if (result.errorResponse) return result.errorResponse; - if (evalArtifact) { - const { rows, errorResponse } = await downloadArtifactRows( - evalArtifact.archive_download_url, - githubToken, - ); - if (errorResponse) return errorResponse; - evaluations = normalizeEvalArtifactRows(rows, date, timestamp, runUrl); + runInfos.push(result.runInfo); + benchmarks.push(...result.benchmarks); + evaluations.push(...result.evaluations); + evalConfigIdOffset = result.nextEvalConfigIdOffset; } return NextResponse.json({ - runInfo: { - ...normalizeGithubRunInfo(run), - isNonMainBranch: run.head_branch !== 'main', - }, + runInfos, benchmarks, evaluations, }); diff --git a/packages/app/src/app/globals.css b/packages/app/src/app/globals.css index a24b311b..50849070 100644 --- a/packages/app/src/app/globals.css +++ b/packages/app/src/app/globals.css @@ -130,6 +130,17 @@ --sidebar-accent-foreground: 
oklch(0.985 0 0); --sidebar-border: oklch(1 0 0 / 10%); --sidebar-ring: oklch(0.556 0 0); + + /* Overlay-run palette (light mode): darker + saturated so X-shape strokes and + * pastel legend swatches stay readable on the #eaebec background. */ + --overlay-run-0: oklch(0.55 0.23 25); /* deep red */ + --overlay-run-1: oklch(0.55 0.15 200); /* teal */ + --overlay-run-2: oklch(0.6 0.17 65); /* burnt amber */ + --overlay-run-3: oklch(0.5 0.25 290); /* royal violet */ + --overlay-run-4: oklch(0.55 0.18 150); /* forest green */ + --overlay-run-5: oklch(0.55 0.22 330); /* magenta */ + --overlay-run-6: oklch(0.55 0.2 230); /* blue */ + --overlay-run-7: oklch(0.6 0.17 60); /* amber-orange */ } .dark { @@ -163,6 +174,18 @@ --sidebar-accent-foreground: oklch(0.985 0 0); --sidebar-border: oklch(1 0 0 / 10%); --sidebar-ring: oklch(0.556 0 0); + + /* Overlay-run palette (dark mode): lighter + saturated so strokes pop on + * #131416. Hues align with the light-mode palette so each run index reads as + * the same conceptual color across themes. */ + --overlay-run-0: oklch(0.72 0.22 25); + --overlay-run-1: oklch(0.75 0.2 190); + --overlay-run-2: oklch(0.78 0.2 90); + --overlay-run-3: oklch(0.7 0.22 290); + --overlay-run-4: oklch(0.75 0.2 150); + --overlay-run-5: oklch(0.7 0.22 330); + --overlay-run-6: oklch(0.72 0.2 230); + --overlay-run-7: oklch(0.78 0.18 60); } /* ── Minecraft Theme ── */ @@ -204,6 +227,17 @@ /* Kill all border radius — Minecraft has no curves */ --radius: 0px; + + /* Overlay-run palette (minecraft theme): same as dark since background is + * also dark (#1e1e1e). 
*/ + --overlay-run-0: oklch(0.72 0.22 25); + --overlay-run-1: oklch(0.75 0.2 190); + --overlay-run-2: oklch(0.78 0.2 90); + --overlay-run-3: oklch(0.7 0.22 290); + --overlay-run-4: oklch(0.75 0.2 150); + --overlay-run-5: oklch(0.7 0.22 330); + --overlay-run-6: oklch(0.72 0.2 230); + --overlay-run-7: oklch(0.78 0.18 60); } /* Force pixel font on everything in minecraft mode */ diff --git a/packages/app/src/components/evaluation/ui/BarChartD3.tsx b/packages/app/src/components/evaluation/ui/BarChartD3.tsx index be5f0211..6e77ebde 100644 --- a/packages/app/src/components/evaluation/ui/BarChartD3.tsx +++ b/packages/app/src/components/evaluation/ui/BarChartD3.tsx @@ -24,6 +24,7 @@ import { Skeleton } from '@/components/ui/skeleton'; import { useUnofficialRun } from '@/components/unofficial-run-provider'; import { useThemeColors } from '@/hooks/useThemeColors'; import { computeToggle } from '@/hooks/useTogglableSet'; +import { overlayRunColor, overlayRunIndex } from '@/lib/overlay-run-style'; const BASE_MARGIN = { top: 24, right: 24, bottom: 52 }; const OVERLAY_X_SIZE = 6; @@ -158,15 +159,37 @@ export default function EvalBarChartD3({ caption }: { caption?: ReactNode }) { const { isUnofficialRun, unofficialRunInfo, + unofficialRunInfos, activeOverlayHwTypes, setActiveOverlayHwTypes, allOverlayHwTypes, resetOverlayHwTypes, localOfficialOverride, setLocalOfficialOverride, + runIndexByUrl, } = useUnofficialRun(); const chartRef = useRef(null); + /** Look up the branch for an eval row via its `runUrl`, falling back to the + * first loaded run. Used so hovering an overlay bar shows that row's own + * branch across multi-run loads. */ + const branchForRow = useCallback( + (datum: EvaluationChartData): string | undefined => { + const url = datum.runUrl ?? 
null; + if (url) { + const direct = runIndexByUrl[url]; + if (direct !== undefined) return unofficialRunInfos[direct]?.branch; + const idMatch = url.match(/\/runs\/(\d+)/); + if (idMatch) { + const viaId = runIndexByUrl[idMatch[1]]; + if (viaId !== undefined) return unofficialRunInfos[viaId]?.branch; + } + } + return unofficialRunInfo?.branch ?? undefined; + }, + [runIndexByUrl, unofficialRunInfos, unofficialRunInfo], + ); + const effectiveOfficialHardware = localOfficialOverride ?? enabledHardware; const allUnifiedHwTypes = useMemo(() => { @@ -318,33 +341,45 @@ export default function EvalBarChartD3({ caption }: { caption?: ReactNode }) { const legendItems = useMemo( () => [ - ...unofficialConfigurations.map(({ hwKey, configLabel }) => ({ - name: `✕ ${configLabel}`, - label: `✕ ${configLabel.replaceAll('\n', ' ')}`, - color: resolveColor(configLabel, hwKey), - title: `UNOFFICIAL: ${configLabel.replaceAll('\n', ' ')}`, - isHighlighted: true, - hw: `overlay:${hwKey}`, - isActive: true, - onClick: () => {}, - tooltip: ( -
-
UNOFFICIAL RUN
-
Branch: {unofficialRunInfo?.branch}
-
Config: {configLabel.replaceAll('\n', ' ')}
- {unofficialRunInfo?.url && ( - - View workflow run - - )} -
- ), - })), + // Overlay legend: one entry per loaded unofficial run that contributes + // points to the current chart. Same palette color as the chart strokes. + ...(unofficialConfigurations.length > 0 && unofficialRunInfos.length > 0 + ? unofficialRunInfos + .map((info, idx) => { + const hasPoints = unofficialChartData.some( + (d) => overlayRunIndex(d.runUrl ?? null, runIndexByUrl) === idx, + ); + if (!hasPoints) return null; + const branch = info.branch || `run ${info.id}`; + return { + name: `✕ unofficial-run-${info.id}`, + label: `✕ ${branch}`, + color: overlayRunColor(idx), + title: `UNOFFICIAL: ${branch}`, + isHighlighted: true, + hw: `overlay-run-${info.id}`, + isActive: true, + onClick: () => {}, + tooltip: ( +
+
UNOFFICIAL RUN
+
Branch: {branch}
+ {info.url && ( + + View workflow run + + )} +
+ ), + }; + }) + .filter((x): x is NonNullable => x !== null) + : []), ...configurations.map(({ hwKey, configLabel }) => ({ name: configLabel, label: configLabel.replaceAll('\n', ' '), @@ -366,7 +401,9 @@ export default function EvalBarChartD3({ caption }: { caption?: ReactNode }) { highlightedConfigs, resolveColor, unofficialConfigurations, - unofficialRunInfo, + unofficialChartData, + unofficialRunInfos, + runIndexByUrl, ], ); @@ -535,11 +572,14 @@ export default function EvalBarChartD3({ caption }: { caption?: ReactNode }) { return bar; }); + bars.style('filter', null); bars .selectAll( '.unofficial-eb-stem, .unofficial-eb-cap-top, .unofficial-eb-cap-bot', ) - .attr('stroke', (d) => getCssColor(resolveColor(d.configLabel, String(d.hwKey)))); + .attr('stroke', (d) => + overlayRunColor(overlayRunIndex(d.runUrl ?? null, runIndexByUrl)), + ); bars .select('.unofficial-eb-stem') @@ -684,10 +724,13 @@ export default function EvalBarChartD3({ caption }: { caption?: ReactNode }) { (d) => `translate(${xScale(d.score)},${(yScale(d.configLabel) || 0) + yScale.bandwidth() / 2})`, ); + overlayPoints.style('filter', null); overlayPoints .select('.unofficial-eval-x') - .attr('stroke', (d) => getCssColor(resolveColor(d.configLabel, String(d.hwKey)))); + .attr('stroke', (d) => + overlayRunColor(overlayRunIndex(d.runUrl ?? null, runIndexByUrl)), + ); overlayPoints.each(function (d) { d3.select(this) @@ -716,13 +759,7 @@ export default function EvalBarChartD3({ caption }: { caption?: ReactNode }) { .style('opacity', 1) .style('display', 'block') .style('pointer-events', 'none') - .html( - generateEvaluationTooltipContent( - d, - false, - unofficialRunInfo?.branch ?? 
undefined, - ), - ); + .html(generateEvaluationTooltipContent(d, false, branchForRow(d))); }) .on('mousemove', function (event) { if (chartRef.current?.isPinned()) return; @@ -742,9 +779,7 @@ export default function EvalBarChartD3({ caption }: { caption?: ReactNode }) { event.stopPropagation(); const [mx, my] = d3.pointer(event, container); tooltip - .html( - generateEvaluationTooltipContent(d, true, unofficialRunInfo?.branch ?? undefined), - ) + .html(generateEvaluationTooltipContent(d, true, branchForRow(d))) .style('opacity', 1) .style('display', 'block') .style('pointer-events', 'auto'); @@ -774,7 +809,8 @@ export default function EvalBarChartD3({ caption }: { caption?: ReactNode }) { showLabels, unofficialChartData, unofficialErrorData, - unofficialRunInfo, + branchForRow, + runIndexByUrl, ], ); diff --git a/packages/app/src/components/inference/types.ts b/packages/app/src/components/inference/types.ts index a23707ba..365923da 100644 --- a/packages/app/src/components/inference/types.ts +++ b/packages/app/src/components/inference/types.ts @@ -311,10 +311,17 @@ export interface OverlayData { data: InferenceData[]; /** Hardware configuration for the overlay data (may have different hardware types) */ hardwareConfig: HardwareConfig; - /** Label for the overlay (e.g., branch name) */ + /** Fallback label — branch of the first loaded run. Used when {@link getRunForRow} is absent + * or returns undefined (legacy single-run callers). */ label: string; - /** URL to the workflow run */ + /** Fallback URL — workflow URL of the first loaded run. */ runUrl?: string; + /** + * Per-point run lookup. Returns `{ branch, url }` of the run that produced + * the given overlay point. When multiple runs are loaded each point still + * shows its own branch/URL in the tooltip rather than the first run's. 
+ */ + getRunForRow?: (row: InferenceData) => { branch: string; url: string } | undefined; } export interface ScatterGraphProps { diff --git a/packages/app/src/components/inference/ui/ChartDisplay.tsx b/packages/app/src/components/inference/ui/ChartDisplay.tsx index 06ecc807..54611ed4 100644 --- a/packages/app/src/components/inference/ui/ChartDisplay.tsx +++ b/packages/app/src/components/inference/ui/ChartDisplay.tsx @@ -165,7 +165,8 @@ export default function ChartDisplay() { track('inference_view_changed', { view: value, chartIndex: index }); }; - const { unofficialRunInfo, getOverlayData, isUnofficialRun } = useUnofficialRun(); + const { unofficialRunInfo, unofficialRunInfos, runIndexByUrl, getOverlayData, isUnofficialRun } = + useUnofficialRun(); // Compute overlay data for each chart type — must match useChartData processing const overlayDataByChartType = useMemo(() => { @@ -176,6 +177,23 @@ export default function ChartDisplay() { const e2eRaw = getOverlayData(selectedModel, selectedSequence, 'e2e'); const interactivityRaw = getOverlayData(selectedModel, selectedSequence, 'interactivity'); + // Per-row run lookup used by the overlay tooltip so hovering a point shows + // its OWN run's branch, not the banner-level first-run fallback. + const getRunForRow = (row: InferenceData) => { + const url = row.run_url ?? null; + if (!url) return undefined; + if (url in runIndexByUrl) { + const info = unofficialRunInfos[runIndexByUrl[url]]; + return info ? { branch: info.branch, url: info.url } : undefined; + } + const idMatch = url.match(/\/runs\/(\d+)/); + if (idMatch && idMatch[1] in runIndexByUrl) { + const info = unofficialRunInfos[runIndexByUrl[idMatch[1]]]; + return info ? 
{ branch: info.branch, url: info.url } : undefined; + } + return undefined; + }; + const processData = ( rawData: { data: InferenceData[]; hardwareConfig: any } | null, chartType: 'e2e' | 'interactivity', @@ -197,6 +215,7 @@ export default function ChartDisplay() { hardwareConfig: rawData.hardwareConfig, label: unofficialRunInfo.branch, runUrl: unofficialRunInfo.url, + getRunForRow, }; }; @@ -206,6 +225,8 @@ export default function ChartDisplay() { }; }, [ unofficialRunInfo, + unofficialRunInfos, + runIndexByUrl, getOverlayData, selectedModel, selectedSequence, diff --git a/packages/app/src/components/inference/ui/ScatterGraph.tsx b/packages/app/src/components/inference/ui/ScatterGraph.tsx index 06a7f531..e2a2a387 100644 --- a/packages/app/src/components/inference/ui/ScatterGraph.tsx +++ b/packages/app/src/components/inference/ui/ScatterGraph.tsx @@ -21,6 +21,11 @@ import type { } from '@/lib/d3-chart/D3Chart/types'; import type { ContinuousScale } from '@/lib/d3-chart/types'; import { computeTooltipPosition } from '@/lib/d3-chart/layers/scatter-points'; +import { + overlayRooflineDasharray, + overlayRunColor, + overlayRunIndex, +} from '@/lib/overlay-run-style'; import { POINT_SIZE, HIT_AREA_RADIUS, @@ -148,6 +153,8 @@ const ScatterGraph = React.memo( resetOverlayHwTypes, localOfficialOverride, setLocalOfficialOverride, + runIndexByUrl, + unofficialRunInfos, } = useUnofficialRun(); const chartRef = useRef(null); @@ -324,17 +331,24 @@ const ScatterGraph = React.memo( }, [filteredData, processedOverlayData]); const overlayRooflines = useMemo(() => { - if (processedOverlayData.length === 0) return {}; + interface Entry { + hwKey: string; + runIndex: number; + points: InferenceData[]; + } + if (processedOverlayData.length === 0) return {} as Record; + // Group by hwKey + precision + runIndex so overlay rooflines from different + // unofficial runs stay separate and can be styled with per-run hue shifts. 
const grouped = processedOverlayData.reduce( (acc, p) => { - const key = `${p.hwKey}_${p.precision}`; - if (!acc[key]) acc[key] = []; - acc[key].push(p); + const runIndex = overlayRunIndex(p.run_url ?? null, runIndexByUrl); + const key = `${p.hwKey}_${p.precision}_run${runIndex}`; + if (!acc[key]) acc[key] = { hwKey: String(p.hwKey), runIndex, points: [] }; + acc[key].points.push(p); return acc; }, - {} as Record, + {} as Record, ); - const result: Record = {}; const rooflineKey = `${selectedYAxisMetric}_roofline` as keyof ChartDefinition; const dir = chartDefinition[rooflineKey] as | 'upper_right' @@ -342,20 +356,21 @@ const ScatterGraph = React.memo( | 'lower_left' | 'lower_right' | undefined; - for (const hw of Object.keys(grouped)) { + const result: Record = {}; + for (const [key, group] of Object.entries(grouped)) { const front = dir === 'upper_right' - ? paretoFrontUpperRight(grouped[hw]) + ? paretoFrontUpperRight(group.points) : dir === 'upper_left' - ? paretoFrontUpperLeft(grouped[hw]) + ? paretoFrontUpperLeft(group.points) : dir === 'lower_left' - ? paretoFrontLowerLeft(grouped[hw]) - : paretoFrontLowerRight(grouped[hw]); + ? 
paretoFrontLowerLeft(group.points) + : paretoFrontLowerRight(group.points); front.sort((a, b) => a.x - b.x); - result[hw] = front; + result[key] = { hwKey: group.hwKey, runIndex: group.runIndex, points: front }; } return result; - }, [processedOverlayData, selectedYAxisMetric, chartDefinition]); + }, [processedOverlayData, selectedYAxisMetric, chartDefinition, runIndexByUrl]); // All official points for rendering (unfiltered — visibility via opacity) const pointsData = useMemo(() => Object.values(groupedData).flat(), [groupedData]); @@ -1286,16 +1301,18 @@ const ScatterGraph = React.memo( key: string; points: InferenceData[]; stroke: string; + runIndex: number; } const ovEntries: OvEntry[] = []; - Object.entries(overlayRooflines).forEach(([key, pts]) => { - const hw = key.split('_').slice(0, -1).join('_'); - const hwCfg = overlayData.hardwareConfig[hw]; - if (hwCfg && pts.length > 1) { + Object.entries(overlayRooflines).forEach(([key, group]) => { + const hwCfg = overlayData.hardwareConfig[group.hwKey]; + if (hwCfg && group.points.length > 1) { ovEntries.push({ key, - points: pts, - stroke: getCssColor(resolveColor(hw)), + points: group.points, + // Color by run — same palette entry the legend uses, so they match. 
+ stroke: overlayRunColor(group.runIndex), + runIndex: group.runIndex, }); } }); @@ -1312,8 +1329,9 @@ const ScatterGraph = React.memo( .attr('fill', 'none') .attr('stroke', (d) => d.stroke) .attr('stroke-width', 2) - .attr('stroke-dasharray', '6 3') - .attr('d', (d) => lineGen(d.points)); + .attr('stroke-dasharray', (d) => overlayRooflineDasharray(d.runIndex)) + .attr('d', (d) => lineGen(d.points)) + .style('filter', null); // Overlay X-shape points — index-keyed so every point renders const overlayPoints = zoomGroup @@ -1342,9 +1360,12 @@ const ScatterGraph = React.memo( }); overlayPoints.attr('transform', (d) => `translate(${xScale(d.x)},${yScale(d.y)})`); + overlayPoints.style('filter', null); overlayPoints .select('.overlay-x') - .attr('stroke', (d) => getCssColor(resolveColor(d.hwKey as string))); + .attr('stroke', (d) => + overlayRunColor(overlayRunIndex(d.run_url ?? null, runIndexByUrl)), + ); // Labels const showLabels = !hidePointLabels && !showGradientLabels; @@ -1454,10 +1475,10 @@ const ScatterGraph = React.memo( .y((d) => newYScale(d.y)) .curve(d3.curveMonotoneX); - Object.entries(overlayRooflines).forEach(([key, pts]) => { - if (pts.length < 2) return; + Object.entries(overlayRooflines).forEach(([key, group]) => { + if (group.points.length < 2) return; const sel = zoomGroup.select(`.overlay-roofline-${key}`); - if (!sel.empty()) sel.attr('d', lineGen(pts) as string); + if (!sel.empty()) sel.attr('d', lineGen(group.points) as string); }); // Update overlay points @@ -1490,6 +1511,7 @@ const ScatterGraph = React.memo( overlayData, processedOverlayData, overlayRooflines, + runIndexByUrl, hardwareConfig, xLabel, yLabel, @@ -1672,32 +1694,35 @@ const ScatterGraph = React.memo( onItemHoverEnd={handleLegendHoverEnd} onItemRemove={showAllHardwareTypes ? undefined : removeHwType} legendItems={[ - ...(overlayData - ? 
Object.entries(overlayData.hardwareConfig) - .filter(([key]) => - overlayData.data.some( - (d) => d.hwKey === key && selectedPrecisions.includes(d.precision), - ), - ) - .map(([key, hwConfig]) => { - const parsed = parseHwKeyToLabel(key); + // Overlay legend: one entry per loaded unofficial run that actually + // contributes points to this chart. Colored from the shared palette + // so the legend swatch matches the stroke color used in the chart. + ...(overlayData && unofficialRunInfos.length > 0 + ? unofficialRunInfos + .map((info, idx) => { + const hasPoints = overlayData.data.some( + (d) => + overlayRunIndex(d.run_url ?? null, runIndexByUrl) === idx && + selectedPrecisions.includes(d.precision), + ); + if (!hasPoints) return null; + const branch = info.branch || `run ${info.id}`; return { - name: `✕ ${key}`, - label: `✕ ${parsed.label}`, - color: resolveColor(key), - title: `UNOFFICIAL: ${hwConfig.framework || parsed.label}`, + name: `✕ unofficial-run-${info.id}`, + label: `✕ ${branch}`, + color: overlayRunColor(idx), + title: `UNOFFICIAL: ${branch}`, isHighlighted: true, - hw: `overlay-${key}`, + hw: `overlay-run-${info.id}`, isActive: true, onClick: () => {}, tooltip: (
UNOFFICIAL RUN
-
Branch: {overlayData.label}
-
Hardware: {parsed.label}
- {overlayData.runUrl && ( +
Branch: {branch}
+ {info.url && ( => x !== null) : []), ...Object.entries(hardwareConfig) .filter(([key]) => diff --git a/packages/app/src/components/inference/utils/tooltipUtils.ts b/packages/app/src/components/inference/utils/tooltipUtils.ts index e88e9930..eca6c8ca 100644 --- a/packages/app/src/components/inference/utils/tooltipUtils.ts +++ b/packages/app/src/components/inference/utils/tooltipUtils.ts @@ -204,6 +204,8 @@ export const generateTooltipContent = (config: TooltipConfig): string => { export const generateOverlayTooltipContent = (config: OverlayTooltipConfig): string => { const { data: d, isPinned, xLabel, yLabel, overlayData } = config; const hwConfig = overlayData.hardwareConfig[d.hwKey]; + const perRow = overlayData.getRunForRow?.(d); + const branch = perRow?.branch ?? overlayData.label; return `
@@ -215,7 +217,7 @@ export const generateOverlayTooltipContent = (config: OverlayTooltipConfig): str ${hwConfig ? getDisplayLabel(hwConfig) : d.hwKey}
- Branch: ${overlayData.label} + Branch: ${branch}
Date: ${d.actualDate ?? d.date} diff --git a/packages/app/src/components/ui/unofficial-banner.tsx b/packages/app/src/components/ui/unofficial-banner.tsx index 983adb1d..d1274c4c 100644 --- a/packages/app/src/components/ui/unofficial-banner.tsx +++ b/packages/app/src/components/ui/unofficial-banner.tsx @@ -3,60 +3,120 @@ import { AlertTriangle, ExternalLink, X } from 'lucide-react'; import { track } from '@/lib/analytics'; +import { overlayRunColor } from '@/lib/overlay-run-style'; + +interface RunInfo { + id: number; + name: string; + branch: string; + sha: string; + createdAt: string; + url: string; +} interface UnofficialBannerProps { - runInfo: { - id: number; - name: string; - branch: string; - sha: string; - createdAt: string; - url: string; - }; - onDismiss?: () => void; + runs: RunInfo[]; + /** Remove a single run from the URL + state. */ + onDismissRun?: (runId: string) => void; + /** Clear all runs at once. Surfaced as "Dismiss all" when `runs.length > 1`. */ + onDismissAll?: () => void; } -export function UnofficialBanner({ runInfo, onDismiss }: UnofficialBannerProps) { +/** + * Compact banner that advertises that the page is showing unofficial run data. + * + * When multiple runs are loaded, each gets a chip with a color swatch (matching + * the chart's per-run color from {@link overlayRunColor}), a link to the + * workflow run, and its own dismiss `×`. A single "Dismiss all" button is + * rendered at the right edge when more than one run is loaded. Previously each + * run rendered its OWN full-width banner and the dismiss button cleared every + * run, which both wasted vertical space and made partial dismissal impossible. + */ +export function UnofficialBanner({ runs, onDismissRun, onDismissAll }: UnofficialBannerProps) { + if (runs.length === 0) return null; + const multiple = runs.length > 1; + return ( -
-
-
- -
- NON-OFFICIAL - - Viewing data from branch:{' '} - - {runInfo.branch} - - +
+
+
+ +
+
+ NON-OFFICIAL + + {multiple ? `Viewing ${runs.length} runs` : 'Viewing data from branch'} + +
+
+ {runs.map((run, idx) => ( + onDismissRun(String(run.id)) : undefined} + /> + ))} +
-
+ + Dismiss all + + )}
); } + +function RunChip({ + run, + color, + onDismiss, +}: { + run: RunInfo; + color: string; + onDismiss?: () => void; +}) { + return ( + + + track('unofficial_banner_view_run', { branch: run.branch })} + className="inline-flex items-center gap-0.5 underline-offset-2 hover:underline" + aria-label={`View workflow run for ${run.branch}`} + > + {run.branch} + + + {onDismiss && ( + + )} + + ); +} diff --git a/packages/app/src/components/unofficial-run-provider.tsx b/packages/app/src/components/unofficial-run-provider.tsx index a44d5c1c..affd7279 100644 --- a/packages/app/src/components/unofficial-run-provider.tsx +++ b/packages/app/src/components/unofficial-run-provider.tsx @@ -51,12 +51,28 @@ interface AvailableModelSequence { export interface UnofficialRunContextType { isUnofficialRun: boolean; + /** First run in the loaded set — kept as a convenience alias for overlay labels. */ unofficialRunInfo: UnofficialRunInfo | null; + /** All runs loaded from the `unofficialrun(s)` URL param (comma-separated). */ + unofficialRunInfos: UnofficialRunInfo[]; + /** + * Position of each run in the loaded set, keyed by both `run.url` and the + * numeric id as a string. Used to derive a distinct hue shift per run for + * overlay points so multiple runs are visually separable. + */ + runIndexByUrl: Record; unofficialChartData: UnofficialChartData | null; unofficialEvalRows: EvalRow[] | null; loading: boolean; error: string | null; + /** Clear every unofficial run. Wipes state + URL. */ clearUnofficialRun: () => void; + /** + * Drop a single run ID. Rewrites the URL to the remaining IDs and filters + * local state (chart data + eval rows + run infos) by `run_url` without + * refetching the others. 
+ */ + dismissRun: (runId: string) => void; availableModelsAndSequences: AvailableModelSequence[]; getOverlayData: ( model: Model, @@ -150,7 +166,8 @@ export function parseAvailableModelsAndSequences( } export function UnofficialRunProvider({ children }: { children: ReactNode }) { - const [unofficialRunInfo, setUnofficialRunInfo] = useState(null); + const [unofficialRunInfos, setUnofficialRunInfos] = useState([]); + const unofficialRunInfo = unofficialRunInfos[0] ?? null; const [unofficialChartData, setUnofficialChartData] = useState(null); const [unofficialEvalRows, setUnofficialEvalRows] = useState(null); const [loading, setLoading] = useState(false); @@ -212,7 +229,7 @@ export function UnofficialRunProvider({ children }: { children: ReactNode }) { ); const clearUnofficialRun = useCallback(() => { - setUnofficialRunInfo(null); + setUnofficialRunInfos([]); setUnofficialChartData(null); setUnofficialEvalRows(null); setError(null); @@ -224,6 +241,97 @@ export function UnofficialRunProvider({ children }: { children: ReactNode }) { window.history.pushState({}, '', url); }, []); + /** + * Drop a single run from the URL + state. Since benchmark rows are tagged + * with `run_url` and eval rows have their own `run_url`, we can filter local + * state by the dismissed run's URL/id without refetching the remaining runs. + */ + const dismissRun = useCallback( + (runId: string) => { + const target = unofficialRunInfos.find((r) => String(r.id) === runId); + if (!target) return; + + const remaining = unofficialRunInfos.filter((r) => String(r.id) !== runId); + + // Rewrite URL to the remaining IDs (or drop param if none left). 
+ const url = new URL(window.location.href); + const existingKeys: string[] = []; + for (const key of url.searchParams.keys()) { + if (UNOFFICIAL_RUN_PARAM_RE.test(key)) existingKeys.push(key); + } + for (const key of existingKeys) url.searchParams.delete(key); + if (remaining.length > 0) { + url.searchParams.set('unofficialrun', remaining.map((r) => r.id).join(',')); + } + window.history.pushState({}, '', url); + + if (remaining.length === 0) { + setUnofficialRunInfos([]); + setUnofficialChartData(null); + setUnofficialEvalRows(null); + setError(null); + setAvailableModelsAndSequences([]); + return; + } + + setUnofficialRunInfos(remaining); + + // Filter chart data by stamped `run_url`. A row belongs to the dismissed + // run if its URL matches exactly OR the numeric id parses to the same. + const belongsToDismissed = (rowUrl?: string | null) => { + if (!rowUrl) return false; + if (rowUrl === target.url) return true; + const m = rowUrl.match(/\/runs\/(\d+)/); + return m !== null && m[1] === runId; + }; + + // Compute the filtered chart data BEFORE any setState so we can pass the + // same value to setUnofficialChartData and parseAvailableModelsAndSequences. + // Writing to an outer variable from inside a setState updater and then + // reading it synchronously is unsafe: React 18 invokes updaters during + // render, not at the call site, so the read would see the initial null. + const nextChartData: UnofficialChartData | null = unofficialChartData + ? 
(() => { + const next: UnofficialChartData = {}; + for (const [key, group] of Object.entries(unofficialChartData)) { + const e2eData = group.e2e.data.filter((d) => !belongsToDismissed(d.run_url)); + const intvData = group.interactivity.data.filter( + (d) => !belongsToDismissed(d.run_url), + ); + if (e2eData.length === 0 && intvData.length === 0) continue; + next[key] = { + e2e: { data: e2eData, gpus: group.e2e.gpus }, + interactivity: { data: intvData, gpus: group.interactivity.gpus }, + }; + } + return next; + })() + : null; + setUnofficialChartData(nextChartData); + // Re-derive available (model, sequence) pairs from surviving runs so the + // model/sequence picker doesn't still offer combos that only existed in + // the dismissed run. + setAvailableModelsAndSequences(parseAvailableModelsAndSequences(nextChartData)); + + setUnofficialEvalRows((prev) => + prev ? prev.filter((row) => !belongsToDismissed(row.run_url)) : prev, + ); + }, + [unofficialRunInfos, unofficialChartData], + ); + + // Build a url → index lookup. Keyed by the full run.url AND by the numeric id + // as a string, since `updateRepoUrl` can rewrite hosts/orgs between the + // overlay rendering path and the run metadata. 
+ const runIndexByUrl = useMemo(() => { + const map: Record = {}; + unofficialRunInfos.forEach((info, idx) => { + if (info.url) map[info.url] = idx; + if (info.id !== undefined && info.id !== null) map[String(info.id)] = idx; + }); + return map; + }, [unofficialRunInfos]); + const getOverlayData = useCallback( (model: Model, sequence: Sequence, chartType: 'e2e' | 'interactivity') => { if (!unofficialChartData) return null; @@ -239,15 +347,15 @@ export function UnofficialRunProvider({ children }: { children: ReactNode }) { useEffect(() => { const load = () => { const params = new URLSearchParams(window.location.search); - let unofficialRunId: string | undefined; + let unofficialRunIdParam: string | undefined; for (const [key, value] of params) { if (UNOFFICIAL_RUN_PARAM_RE.test(key) && value) { - unofficialRunId = value; + unofficialRunIdParam = value; break; } } - if (!unofficialRunId) { - setUnofficialRunInfo(null); + if (!unofficialRunIdParam) { + setUnofficialRunInfos([]); setUnofficialChartData(null); setUnofficialEvalRows(null); setError(null); @@ -258,12 +366,14 @@ export function UnofficialRunProvider({ children }: { children: ReactNode }) { setLoading(true); setError(null); - fetch(`/api/unofficial-run?runId=${unofficialRunId}`) + // Pass the raw param value through — it may be a single id or a comma-separated list. + // encodeURIComponent preserves commas while escaping any accidental whitespace/symbols. + fetch(`/api/unofficial-run?runId=${encodeURIComponent(unofficialRunIdParam)}`) .then(async (response) => { const data = await response.json(); if (!response.ok) throw new Error(data.error || 'Failed to fetch unofficial run'); - setUnofficialRunInfo(data.runInfo); + setUnofficialRunInfos(Array.isArray(data.runInfos) ? data.runInfos : []); const chartData = buildChartData(data.benchmarks ?? []); setUnofficialChartData(chartData); setUnofficialEvalRows(data.evaluations ?? 
[]); @@ -271,7 +381,7 @@ export function UnofficialRunProvider({ children }: { children: ReactNode }) { }) .catch((caughtError) => { setError(caughtError instanceof Error ? caughtError.message : 'Unknown error'); - setUnofficialRunInfo(null); + setUnofficialRunInfos([]); setUnofficialChartData(null); setUnofficialEvalRows(null); setAvailableModelsAndSequences([]); @@ -287,13 +397,16 @@ export function UnofficialRunProvider({ children }: { children: ReactNode }) { return ( 0, unofficialRunInfo, + unofficialRunInfos, + runIndexByUrl, unofficialChartData, unofficialEvalRows, loading, error, clearUnofficialRun, + dismissRun, availableModelsAndSequences, getOverlayData, activeOverlayHwTypes, @@ -305,8 +418,12 @@ export function UnofficialRunProvider({ children }: { children: ReactNode }) { setLocalOfficialOverride, }} > - {unofficialRunInfo && ( - + {unofficialRunInfos.length > 0 && ( + )} {children} diff --git a/packages/app/src/lib/overlay-run-style.ts b/packages/app/src/lib/overlay-run-style.ts new file mode 100644 index 00000000..0cd57610 --- /dev/null +++ b/packages/app/src/lib/overlay-run-style.ts @@ -0,0 +1,71 @@ +/** + * Shared helpers for visually differentiating unofficial-run overlay points + * when one or more runs are loaded. Consumed by the inference scatter plot + * and the evaluation bar chart. + * + * Design: instead of applying a CSS filter to an hwKey-derived base color + * (which is brittle — `hue-rotate` on gray is a no-op, and filter output + * can't be re-used in legend swatches that style `background-color` directly), + * we assign each run a fixed palette color. The same palette is used by the + * chart strokes AND the legend entries, so they always match visually. + * + * Trade-off: overlay points no longer encode hardware via color. Hardware is + * still identifiable via the X-mark shape, the point label (TP number or + * advanced label), and the tooltip. + */ + +/** + * Number of entries in the overlay-run palette. 
The actual color values are + * theme-aware CSS custom properties defined in `globals.css` as + * `--overlay-run-0` .. `--overlay-run-7`; light mode uses darker/saturated + * hues for contrast on a light background, dark/minecraft modes use the + * lighter hues this file used to hard-code. + */ +const RUN_PALETTE_SIZE = 8; + +/** + * Return the palette color for a given run index (wraps on overflow). + * Resolves to a theme-aware CSS variable so charts + legend swatches recolor + * automatically when the user toggles light/dark. + */ +export function overlayRunColor(runIndex: number): string { + const slot = ((runIndex % RUN_PALETTE_SIZE) + RUN_PALETTE_SIZE) % RUN_PALETTE_SIZE; + return `var(--overlay-run-${slot})`; +} + +/** + * Dash pattern for an overlay roofline at a given run index. Layered on top + * of the per-run color so runs stay distinguishable even on grayscale + * screenshots or print. + */ +const ROOFLINE_DASH_BY_RUN: readonly string[] = [ + '6 3', + '2 3', + '10 3 2 3', + '5 3 2 3 2 3', + '12 2', + '3 1', +]; +export function overlayRooflineDasharray(runIndex: number): string { + return ROOFLINE_DASH_BY_RUN[ + ((runIndex % ROOFLINE_DASH_BY_RUN.length) + ROOFLINE_DASH_BY_RUN.length) % + ROOFLINE_DASH_BY_RUN.length + ]; +} + +/** + * Resolve a point's run index from its `run_url`. Falls back to parsing the + * numeric id out of `/runs/{id}` — needed because `updateRepoUrl` may + * rewrite the host/org between the raw URL stored on the point and the + * lookup map constructed from run metadata. 
+ */ +export function overlayRunIndex( + runUrl: string | null | undefined, + map: Record, +): number { + if (!runUrl) return 0; + if (runUrl in map) return map[runUrl]; + const idMatch = runUrl.match(/\/runs\/(\d+)/); + if (idMatch && idMatch[1] in map) return map[idMatch[1]]; + return 0; +} diff --git a/packages/db/src/etl/normalizers.test.ts b/packages/db/src/etl/normalizers.test.ts index 7b0ac0e2..5598b985 100644 --- a/packages/db/src/etl/normalizers.test.ts +++ b/packages/db/src/etl/normalizers.test.ts @@ -116,6 +116,11 @@ describe('resolveModelKey', () => { expect(resolveModelKey({ infmax_model_prefix: 'gptoss' })).toBe('gptoss120b'); }); + it('resolves dsv4pro alias from prefix', () => { + expect(resolveModelKey({ infmax_model_prefix: 'dsv4pro' })).toBe('dsv4'); + expect(resolveModelKey({ infmax_model_prefix: 'dsv4pro-fp8' })).toBe('dsv4'); + }); + it('falls back to MODEL_TO_KEY when prefix not present', () => { expect(resolveModelKey({ model: 'deepseek-ai/DeepSeek-R1' })).toBe('dsr1'); expect(resolveModelKey({ model: 'nvidia/Llama-3.3-70B-Instruct-FP8' })).toBe('llama70b'); diff --git a/packages/db/src/etl/normalizers.ts b/packages/db/src/etl/normalizers.ts index f3cc13dd..51b00df6 100644 --- a/packages/db/src/etl/normalizers.ts +++ b/packages/db/src/etl/normalizers.ts @@ -52,6 +52,7 @@ const PRECISION_SUFFIX = /-(?:fp4|fp8|mxfp4|nvfp4)(?:-.*)?$/i; /** Explicit aliases for prefixes that don't match any DB key after suffix stripping. */ const PREFIX_ALIASES: Record = { gptoss: 'gptoss120b', + dsv4pro: 'dsv4', }; function resolvePrefixToKey(prefix: string): string | null {