From 5164b65798127fb3c85a53646aa0c54c19983bab Mon Sep 17 00:00:00 2001 From: Austin Kelsay Date: Mon, 30 Mar 2026 05:05:11 -0700 Subject: [PATCH] dashboard layout improvements --- .../leaderboard/leaderboard-chart-gallery.tsx | 15 +++--- .../leaderboard/leaderboard-summary-cards.tsx | 4 +- .../components/run-detail/run-detail-page.tsx | 46 +++++++++--------- memory/MEMORY.md | 47 +++++++++++++++++++ src/lib/benchmark-checkpoint.ts | 5 ++ test/benchmark-checkpoint.test.ts | 26 ++++++++++ test/build-index.test.ts | 5 ++ 7 files changed, 115 insertions(+), 33 deletions(-) create mode 100644 memory/MEMORY.md diff --git a/apps/dashboard/src/components/leaderboard/leaderboard-chart-gallery.tsx b/apps/dashboard/src/components/leaderboard/leaderboard-chart-gallery.tsx index 063e972..8dff74a 100644 --- a/apps/dashboard/src/components/leaderboard/leaderboard-chart-gallery.tsx +++ b/apps/dashboard/src/components/leaderboard/leaderboard-chart-gallery.tsx @@ -5,7 +5,7 @@ * Invariants: * - Renders aggregate charts before filter/table sections * - Uses filtered aggregate items so charts stay aligned with visible leaderboard scope - * - Hierarchy: heatmap → composite → pass+blind → radar+token → failure+difficulty → timing+frontier → head-to-head + * - Hierarchy: composite → heatmap → pass+blind → radar+token → failure+difficulty → timing+frontier → head-to-head */ import { BlindVsInformedChart } from "@/components/charts/blind-vs-informed-chart"; @@ -44,9 +44,8 @@ export function LeaderboardChartGallery({ Aggregate Charts

- The leaderboard now opens on the aggregate signal: rank, prompt - delta, timing, frontier alignment, and pass-rate breakdowns across - the latest checkpoint. + Aggregate signals for the latest checkpoint: model rankings, prompt + delta, timing, frontier alignment, and pass-rate breakdowns.

@@ -54,12 +53,12 @@ export function LeaderboardChartGallery({ - {/* 1. Hero: Model x Test Heatmap — full overview at a glance */} - - - {/* 2. Composite Score — primary analytical ranking */} + {/* 1. Composite Score — primary ranking: which model is best */} + {/* 2. Model x Test Heatmap — per-test breakdown after rank context */} + + {/* 3. Core metrics: Pass Rate + Blind vs Informed */}
diff --git a/apps/dashboard/src/components/leaderboard/leaderboard-summary-cards.tsx b/apps/dashboard/src/components/leaderboard/leaderboard-summary-cards.tsx index f0de93f..46f4487 100644 --- a/apps/dashboard/src/components/leaderboard/leaderboard-summary-cards.tsx +++ b/apps/dashboard/src/components/leaderboard/leaderboard-summary-cards.tsx @@ -27,7 +27,7 @@ const CARD_ACCENT_COLORS = [ "hsl(212, 100%, 67%)", // info blue — deduped items "hsl(156, 67%, 55%)", // success green — pass rate "hsl(270, 60%, 60%)", // purple — frontier coverage - "hsl(210, 85%, 60%)", // blue — avg duration + "hsl(210, 85%, 60%)", // blue — median duration ]; /** @@ -97,7 +97,7 @@ export function LeaderboardSummaryCards({ sub: `${frontierCount} of ${filteredItemCount} items`, }, { - title: "Avg Duration", + title: "Median Duration", value: medianDuration !== null ? formatDuration(medianDuration) : "—", sub: durations.length > 0 ? `${durations.length} items` : "no data", }, diff --git a/apps/dashboard/src/components/run-detail/run-detail-page.tsx b/apps/dashboard/src/components/run-detail/run-detail-page.tsx index cf4342c..b648c5a 100644 --- a/apps/dashboard/src/components/run-detail/run-detail-page.tsx +++ b/apps/dashboard/src/components/run-detail/run-detail-page.tsx @@ -224,29 +224,6 @@ export function RunDetailPage({ run, plan }: RunDetailPageProps) { - - - {/* Matrix Table */} - - - Results Matrix - - - - - - - {/* Breakdowns */} -
- - -
- -
- - -
- {/* Primary Chart - Composite Scores */}
+ {/* Breakdowns */} +
+ + +
+ +
+ + +
+ + + + + {/* Matrix Table */} + + + Results Matrix + + + + + diff --git a/memory/MEMORY.md b/memory/MEMORY.md new file mode 100644 index 0000000..3b84107 --- /dev/null +++ b/memory/MEMORY.md @@ -0,0 +1,47 @@ +# plebdev-bench Memory + +## Project +Local LLM benchmark runner — CLI-driven test harness + scoring pipeline. +Stack: Bun + TypeScript, Zod, Vitest, Pino, Recharts (dashboard), React/Vite. + +## Key Paths +- `src/` — CLI, runner, harnesses, runtimes, lib, schemas, tests +- `apps/dashboard/` — React dashboard (Vite + Tailwind + Recharts) +- `apps/dashboard/public/results/` — Published static run artifacts +- `results/` — Raw timestamped run outputs (plan.json + run.json) +- `llm/project/` — Canonical project docs (read before making changes) + +## Dashboard Architecture +- Data source: static JSON files under `apps/dashboard/public/results/` + - `index.json` — DashboardIndex listing all runs (v3 schema) + - `/plan.json` + `/run.json` — Per-run artifacts + - `aggregates/latest.json` — Latest checkpoint aggregate for leaderboard + - `aggregates/.json` — Per-checkpoint aggregates +- 4 pages: Leaderboard, Run List, Run Detail, About +- Key lib files: `src/lib/api.ts`, `src/lib/types.ts`, `src/lib/aggregations*.ts` + +## Current Data State (as of 2026-03-30) +- Latest checkpoint: `chk_sha256v1_432187a085f7` +- 12 total runs indexed, 3 match latest checkpoint +- 1512 deduped items, 1 machine (Apple M4 Pro / 64GB) +- 24 models tested: qwen3.x, qwen2.5, devstral, gpt-oss:20b, etc. +- 3 harnesses: direct, goose, opencode +- 16 tests: smoke, calculator-*, todo-app, tool-smoke, rate-limiter, ttl-cache, event-emitter, workspace-*, file-*, targeted-edit, safe-cleanup +- Top models: gpt-oss:20b (98.5%), qwen3.5:27b (97.1%), devstral-small-2:24b / qwen3:8b (94.6%) +- Bottom: llama3.2:3b (18.9%), qwen3.5:0.8b (20.3%) + +## Composite Score Formula +effectiveScore = passRate × 0.4 + completionRate × 0.3 + toolSuccessRate × 0.3 + +## Checkpoint System +- `src/lib/benchmark-checkpoint.ts` — SHA-256 hash of all test assets + core lib files +- Checkpoints roll when test prompts, scoring specs, or core pipeline code changes +- Leaderboard only shows runs matching the latest checkpoint + +## Aggregation / Dedup Logic +- Best result per (machineInstanceId × matrixKey) across runs within a checkpoint +- Published via `bun dashboard:index` command +- Leaderboard uses `aggregates/latest.json` only + +## User Preferences +- (none recorded yet) diff --git a/src/lib/benchmark-checkpoint.ts b/src/lib/benchmark-checkpoint.ts index 0801d3d..ccc0413 100644 --- a/src/lib/benchmark-checkpoint.ts +++ b/src/lib/benchmark-checkpoint.ts @@ -30,8 +30,13 @@ const CORE_BENCHMARK_LIB_ASSETS = [ "src/lib/benchmark-checkpoint.ts", "src/lib/scorer.ts", "src/lib/scorer-core.ts", + "src/lib/code-module-scorer.ts", "src/lib/scorer-worker.ts", "src/lib/scoring-spec.ts", + "src/lib/workspace-scorer.ts", + "src/lib/workspace-manifest.ts", + "src/lib/test-workspace.ts", + "src/lib/signal-assessment.ts", "src/lib/code-extractor.ts", "src/lib/stdout-suppressor.ts", "src/lib/test-catalog.ts", diff --git a/test/benchmark-checkpoint.test.ts b/test/benchmark-checkpoint.test.ts index 6249ba4..acef21b 100644 --- a/test/benchmark-checkpoint.test.ts +++ b/test/benchmark-checkpoint.test.ts @@ -21,8 +21,13 @@ const REQUIRED_LIB_ASSETS = [ "benchmark-checkpoint.ts", "scorer.ts", "scorer-core.ts", + "code-module-scorer.ts", "scorer-worker.ts", "scoring-spec.ts", + "workspace-scorer.ts", + "workspace-manifest.ts", + "test-workspace.ts", + "signal-assessment.ts", "code-extractor.ts", "stdout-suppressor.ts", "test-catalog.ts", @@ -141,12 +146,33 @@ describe("benchmark checkpoint", () => { expect(after.checkpointId).not.toBe(before.checkpointId); }); + it("changes manifest hash when signal assessment changes", () => { + const root = createBenchmarkRoot(); + const before = computeBenchmarkCheckpoint(root); + + const signalAssessmentPath = path.join( + root, + "src", + "lib", + "signal-assessment.ts", + ); + fs.writeFileSync( + signalAssessmentPath, + 'export const signal_assessment_ts = "updated";\n', + ); + + const after = computeBenchmarkCheckpoint(root); + expect(after.manifestHash).not.toBe(before.manifestHash); + expect(after.checkpointId).not.toBe(before.checkpointId); + }); + it("collects a deterministic sorted asset list", () => { const root = createBenchmarkRoot(); const assets = collectBenchmarkAssetPaths(root); expect(assets).toEqual([...assets].sort((a, b) => a.localeCompare(b))); expect(assets).toContain("src/harnesses/direct-adapter.ts"); expect(assets).toContain("src/lib/scorer.ts"); + expect(assets).toContain("src/lib/signal-assessment.ts"); expect(assets).toContain("src/tests/smoke/prompt.blind.md"); }); }); diff --git a/test/build-index.test.ts b/test/build-index.test.ts index 5df0cf7..8624120 100644 --- a/test/build-index.test.ts +++ b/test/build-index.test.ts @@ -29,8 +29,13 @@ const REQUIRED_LIB_ASSETS = [ "benchmark-checkpoint.ts", "scorer.ts", "scorer-core.ts", + "code-module-scorer.ts", "scorer-worker.ts", "scoring-spec.ts", + "workspace-scorer.ts", + "workspace-manifest.ts", + "test-workspace.ts", + "signal-assessment.ts", "code-extractor.ts", "stdout-suppressor.ts", "test-catalog.ts",