diff --git a/apps/dashboard/src/components/leaderboard/leaderboard-chart-gallery.tsx b/apps/dashboard/src/components/leaderboard/leaderboard-chart-gallery.tsx
index 063e972..8dff74a 100644
--- a/apps/dashboard/src/components/leaderboard/leaderboard-chart-gallery.tsx
+++ b/apps/dashboard/src/components/leaderboard/leaderboard-chart-gallery.tsx
@@ -5,7 +5,7 @@
* Invariants:
* - Renders aggregate charts before filter/table sections
* - Uses filtered aggregate items so charts stay aligned with visible leaderboard scope
- * - Hierarchy: heatmap → composite → pass+blind → radar+token → failure+difficulty → timing+frontier → head-to-head
+ * - Hierarchy: composite → heatmap → pass+blind → radar+token → failure+difficulty → timing+frontier → head-to-head
*/
import { BlindVsInformedChart } from "@/components/charts/blind-vs-informed-chart";
@@ -44,9 +44,8 @@ export function LeaderboardChartGallery({
Aggregate Charts
- The leaderboard now opens on the aggregate signal: rank, prompt
- delta, timing, frontier alignment, and pass-rate breakdowns across
- the latest checkpoint.
+ Aggregate signals for the latest checkpoint: model rankings, prompt
+ delta, timing, frontier alignment, and pass-rate breakdowns.
@@ -54,12 +53,12 @@ export function LeaderboardChartGallery({
- {/* 1. Hero: Model x Test Heatmap — full overview at a glance */}
-
-
- {/* 2. Composite Score — primary analytical ranking */}
+ {/* 1. Composite Score — primary ranking: which model is best */}
+ {/* 2. Model x Test Heatmap — per-test breakdown after rank context */}
+
+
{/* 3. Core metrics: Pass Rate + Blind vs Informed */}
diff --git a/apps/dashboard/src/components/leaderboard/leaderboard-summary-cards.tsx b/apps/dashboard/src/components/leaderboard/leaderboard-summary-cards.tsx
index f0de93f..46f4487 100644
--- a/apps/dashboard/src/components/leaderboard/leaderboard-summary-cards.tsx
+++ b/apps/dashboard/src/components/leaderboard/leaderboard-summary-cards.tsx
@@ -27,7 +27,7 @@ const CARD_ACCENT_COLORS = [
"hsl(212, 100%, 67%)", // info blue — deduped items
"hsl(156, 67%, 55%)", // success green — pass rate
"hsl(270, 60%, 60%)", // purple — frontier coverage
- "hsl(210, 85%, 60%)", // blue — avg duration
+ "hsl(210, 85%, 60%)", // blue — median duration
];
/**
@@ -97,7 +97,7 @@ export function LeaderboardSummaryCards({
sub: `${frontierCount} of ${filteredItemCount} items`,
},
{
- title: "Avg Duration",
+ title: "Median Duration",
value: medianDuration !== null ? formatDuration(medianDuration) : "—",
sub: durations.length > 0 ? `${durations.length} items` : "no data",
},
diff --git a/apps/dashboard/src/components/run-detail/run-detail-page.tsx b/apps/dashboard/src/components/run-detail/run-detail-page.tsx
index cf4342c..b648c5a 100644
--- a/apps/dashboard/src/components/run-detail/run-detail-page.tsx
+++ b/apps/dashboard/src/components/run-detail/run-detail-page.tsx
@@ -224,29 +224,6 @@ export function RunDetailPage({ run, plan }: RunDetailPageProps) {
-
-
- {/* Matrix Table */}
-
-
- Results Matrix
-
-
-
-
-
-
- {/* Breakdowns */}
-
-
-
-
-
-
-
-
-
-
{/* Primary Chart - Composite Scores */}
+ {/* Breakdowns */}
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {/* Matrix Table */}
+
+
+ Results Matrix
+
+
+
+
+
diff --git a/memory/MEMORY.md b/memory/MEMORY.md
new file mode 100644
index 0000000..3b84107
--- /dev/null
+++ b/memory/MEMORY.md
@@ -0,0 +1,47 @@
+# plebdev-bench Memory
+
+## Project
+Local LLM benchmark runner — CLI-driven test harness + scoring pipeline.
+Stack: Bun + TypeScript, Zod, Vitest, Pino, Recharts (dashboard), React/Vite.
+
+## Key Paths
+- `src/` — CLI, runner, harnesses, runtimes, lib, schemas, tests
+- `apps/dashboard/` — React dashboard (Vite + Tailwind + Recharts)
+- `apps/dashboard/public/results/` — Published static run artifacts
+- `results/` — Raw timestamped run outputs (plan.json + run.json)
+- `llm/project/` — Canonical project docs (read before making changes)
+
+## Dashboard Architecture
+- Data source: static JSON files under `apps/dashboard/public/results/`
+ - `index.json` — DashboardIndex listing all runs (v3 schema)
+ - `<run-id>/plan.json` + `<run-id>/run.json` — Per-run artifacts (one directory per run)
+ - `aggregates/latest.json` — Latest checkpoint aggregate for leaderboard
+ - `aggregates/<checkpoint-id>.json` — Per-checkpoint aggregates
+- 4 pages: Leaderboard, Run List, Run Detail, About
+- Key lib files: `src/lib/api.ts`, `src/lib/types.ts`, `src/lib/aggregations*.ts`
+
+## Current Data State (as of 2026-03-30)
+- Latest checkpoint: `chk_sha256v1_432187a085f7`
+- 12 total runs indexed, 3 match latest checkpoint
+- 1512 deduped items, 1 machine (Apple M4 Pro / 64GB)
+- 24 models tested: qwen3.x, qwen2.5, devstral, gpt-oss:20b, etc.
+- 3 harnesses: direct, goose, opencode
+- 16 tests: smoke, calculator-*, todo-app, tool-smoke, rate-limiter, ttl-cache, event-emitter, workspace-*, file-*, targeted-edit, safe-cleanup
+- Top models: gpt-oss:20b (98.5%), qwen3.5:27b (97.1%), devstral-small-2:24b / qwen3:8b (94.6%)
+- Bottom: llama3.2:3b (18.9%), qwen3.5:0.8b (20.3%)
+
+## Composite Score Formula
+effectiveScore = passRate × 0.4 + completionRate × 0.3 + toolSuccessRate × 0.3
+
+## Checkpoint System
+- `src/lib/benchmark-checkpoint.ts` — SHA-256 hash of all test assets + core lib files
+- Checkpoints roll when test prompts, scoring specs, or core pipeline code changes
+- Leaderboard only shows runs matching the latest checkpoint
+
+## Aggregation / Dedup Logic
+- Best result per (machineInstanceId × matrixKey) across runs within a checkpoint
+- Published via `bun dashboard:index` command
+- Leaderboard uses `aggregates/latest.json` only
+
+## User Preferences
+- (none recorded yet)
diff --git a/src/lib/benchmark-checkpoint.ts b/src/lib/benchmark-checkpoint.ts
index 0801d3d..ccc0413 100644
--- a/src/lib/benchmark-checkpoint.ts
+++ b/src/lib/benchmark-checkpoint.ts
@@ -30,8 +30,13 @@ const CORE_BENCHMARK_LIB_ASSETS = [
"src/lib/benchmark-checkpoint.ts",
"src/lib/scorer.ts",
"src/lib/scorer-core.ts",
+ "src/lib/code-module-scorer.ts",
"src/lib/scorer-worker.ts",
"src/lib/scoring-spec.ts",
+ "src/lib/workspace-scorer.ts",
+ "src/lib/workspace-manifest.ts",
+ "src/lib/test-workspace.ts",
+ "src/lib/signal-assessment.ts",
"src/lib/code-extractor.ts",
"src/lib/stdout-suppressor.ts",
"src/lib/test-catalog.ts",
diff --git a/test/benchmark-checkpoint.test.ts b/test/benchmark-checkpoint.test.ts
index 6249ba4..acef21b 100644
--- a/test/benchmark-checkpoint.test.ts
+++ b/test/benchmark-checkpoint.test.ts
@@ -21,8 +21,13 @@ const REQUIRED_LIB_ASSETS = [
"benchmark-checkpoint.ts",
"scorer.ts",
"scorer-core.ts",
+ "code-module-scorer.ts",
"scorer-worker.ts",
"scoring-spec.ts",
+ "workspace-scorer.ts",
+ "workspace-manifest.ts",
+ "test-workspace.ts",
+ "signal-assessment.ts",
"code-extractor.ts",
"stdout-suppressor.ts",
"test-catalog.ts",
@@ -141,12 +146,33 @@ describe("benchmark checkpoint", () => {
expect(after.checkpointId).not.toBe(before.checkpointId);
});
+ it("changes manifest hash when signal assessment changes", () => {
+ const root = createBenchmarkRoot();
+ const before = computeBenchmarkCheckpoint(root);
+
+ const signalAssessmentPath = path.join(
+ root,
+ "src",
+ "lib",
+ "signal-assessment.ts",
+ );
+ fs.writeFileSync(
+ signalAssessmentPath,
+ 'export const signal_assessment_ts = "updated";\n',
+ );
+
+ const after = computeBenchmarkCheckpoint(root);
+ expect(after.manifestHash).not.toBe(before.manifestHash);
+ expect(after.checkpointId).not.toBe(before.checkpointId);
+ });
+
it("collects a deterministic sorted asset list", () => {
const root = createBenchmarkRoot();
const assets = collectBenchmarkAssetPaths(root);
expect(assets).toEqual([...assets].sort((a, b) => a.localeCompare(b)));
expect(assets).toContain("src/harnesses/direct-adapter.ts");
expect(assets).toContain("src/lib/scorer.ts");
+ expect(assets).toContain("src/lib/signal-assessment.ts");
expect(assets).toContain("src/tests/smoke/prompt.blind.md");
});
});
diff --git a/test/build-index.test.ts b/test/build-index.test.ts
index 5df0cf7..8624120 100644
--- a/test/build-index.test.ts
+++ b/test/build-index.test.ts
@@ -29,8 +29,13 @@ const REQUIRED_LIB_ASSETS = [
"benchmark-checkpoint.ts",
"scorer.ts",
"scorer-core.ts",
+ "code-module-scorer.ts",
"scorer-worker.ts",
"scoring-spec.ts",
+ "workspace-scorer.ts",
+ "workspace-manifest.ts",
+ "test-workspace.ts",
+ "signal-assessment.ts",
"code-extractor.ts",
"stdout-suppressor.ts",
"test-catalog.ts",