AustinKelsay · AustinKelsay · Mar 30, 2026 · Mar 30, 2026
diff --git a/apps/dashboard/src/components/leaderboard/leaderboard-chart-gallery.tsx b/apps/dashboard/src/components/leaderboard/leaderboard-chart-gallery.tsx
@@ -5,7 +5,7 @@
  * Invariants:
  * - Renders aggregate charts before filter/table sections
  * - Uses filtered aggregate items so charts stay aligned with visible leaderboard scope
- * - Hierarchy: heatmap → composite → pass+blind → radar+token → failure+difficulty → timing+frontier → head-to-head
+ * - Hierarchy: composite → heatmap → pass+blind → radar+token → failure+difficulty → timing+frontier → head-to-head
  */
 
 import { BlindVsInformedChart } from "@/components/charts/blind-vs-informed-chart";
@@ -44,22 +44,21 @@ export function LeaderboardChartGallery({
 						Aggregate Charts
 					</h2>
 					<p className="mt-1 text-sm text-foreground-muted">
-						The leaderboard now opens on the aggregate signal: rank, prompt
-						delta, timing, frontier alignment, and pass-rate breakdowns across
-						the latest checkpoint.
+						Aggregate signals for the latest checkpoint: model rankings, prompt
+						delta, timing, frontier alignment, and pass-rate breakdowns.
 					</p>
 				</div>
 				<Badge variant="secondary" className="w-fit">
 					{items.length} filtered items
 				</Badge>
 			</div>
 
-			{/* 1. Hero: Model x Test Heatmap — full overview at a glance */}
-			<ModelTestHeatmap items={items} />
-
-			{/* 2. Composite Score — primary analytical ranking */}
+			{/* 1. Composite Score — primary ranking: which model is best */}
 			<CompositeScoreChart items={items} />
 
+			{/* 2. Model x Test Heatmap — per-test breakdown after rank context */}
+			<ModelTestHeatmap items={items} />
+
 			{/* 3. Core metrics: Pass Rate + Blind vs Informed */}
 			<div className="grid gap-4 xl:grid-cols-2">
 				<PassRateChart items={items} />

diff --git a/apps/dashboard/src/components/leaderboard/leaderboard-summary-cards.tsx b/apps/dashboard/src/components/leaderboard/leaderboard-summary-cards.tsx
@@ -27,7 +27,7 @@ const CARD_ACCENT_COLORS = [
 	"hsl(212, 100%, 67%)", // info blue — deduped items
 	"hsl(156, 67%, 55%)",  // success green — pass rate
 	"hsl(270, 60%, 60%)",  // purple — frontier coverage
-	"hsl(210, 85%, 60%)",  // blue — avg duration
+	"hsl(210, 85%, 60%)",  // blue — median duration
 ];
 
 /**
@@ -97,7 +97,7 @@ export function LeaderboardSummaryCards({
 			sub: `${frontierCount} of ${filteredItemCount} items`,
 		},
 		{
-			title: "Avg Duration",
+			title: "Median Duration",
 			value: medianDuration !== null ? formatDuration(medianDuration) : "—",
 			sub: durations.length > 0 ? `${durations.length} items` : "no data",
 		},

diff --git a/apps/dashboard/src/components/run-detail/run-detail-page.tsx b/apps/dashboard/src/components/run-detail/run-detail-page.tsx
@@ -224,29 +224,6 @@ export function RunDetailPage({ run, plan }: RunDetailPageProps) {
 				</TabsList>
 
 				<TabsContent value="overview" className="mt-4 space-y-6">
-					<CoverageDiagnostics run={run} plan={plan} />
-
-					{/* Matrix Table */}
-					<Card>
-						<CardHeader>
-							<CardTitle className="text-base">Results Matrix</CardTitle>
-						</CardHeader>
-						<CardContent>
-							<MatrixTable items={run.items} onRowClick={setSelectedItem} />
-						</CardContent>
-					</Card>
-
-					{/* Breakdowns */}
-					<div className="grid gap-4 md:grid-cols-2">
-						<ScoringBreakdown items={run.items} />
-						<ToolingBreakdown items={run.items} />
-					</div>
-
-					<div className="grid gap-4 md:grid-cols-2">
-						<TimingStats items={run.items} />
-						<FailureBreakdown items={run.items} />
-					</div>
-
 					{/* Primary Chart - Composite Scores */}
 					<CompositeScoreChart
 						items={run.items}
@@ -261,7 +238,30 @@ export function RunDetailPage({ run, plan }: RunDetailPageProps) {
 						<TimingDistribution items={run.items} />
 					</div>
 
+					{/* Breakdowns */}
+					<div className="grid gap-4 md:grid-cols-2">
+						<ScoringBreakdown items={run.items} />
+						<ToolingBreakdown items={run.items} />
+					</div>
+
+					<div className="grid gap-4 md:grid-cols-2">
+						<TimingStats items={run.items} />
+						<FailureBreakdown items={run.items} />
+					</div>
+
 					<FrontierEvalScatter items={run.items} />
+
+					<CoverageDiagnostics run={run} plan={plan} />
+
+					{/* Matrix Table */}
+					<Card>
+						<CardHeader>
+							<CardTitle className="text-base">Results Matrix</CardTitle>
+						</CardHeader>
+						<CardContent>
+							<MatrixTable items={run.items} onRowClick={setSelectedItem} />
+						</CardContent>
+					</Card>
 				</TabsContent>
 
 				<TabsContent value="model" className="mt-4">

diff --git a/memory/MEMORY.md b/memory/MEMORY.md
@@ -0,0 +1,47 @@
+# plebdev-bench Memory
+
+## Project
+Local LLM benchmark runner — CLI-driven test harness + scoring pipeline.
+Stack: Bun + TypeScript, Zod, Vitest, Pino, Recharts (dashboard), React/Vite.
+
+## Key Paths
+- `src/` — CLI, runner, harnesses, runtimes, lib, schemas, tests
+- `apps/dashboard/` — React dashboard (Vite + Tailwind + Recharts)
+- `apps/dashboard/public/results/` — Published static run artifacts
+- `results/` — Raw timestamped run outputs (plan.json + run.json)
+- `llm/project/` — Canonical project docs (read before making changes)
+
+## Dashboard Architecture
+- Data source: static JSON files under `apps/dashboard/public/results/`
+  - `index.json` — DashboardIndex listing all runs (v3 schema)
+  - `<runId>/plan.json` + `<runId>/run.json` — Per-run artifacts
+  - `aggregates/latest.json` — Latest checkpoint aggregate for leaderboard
+  - `aggregates/<checkpointId>.json` — Per-checkpoint aggregates
+- 4 pages: Leaderboard, Run List, Run Detail, About
+- Key lib files: `src/lib/api.ts`, `src/lib/types.ts`, `src/lib/aggregations*.ts`
+
+## Current Data State (as of 2026-03-30)
+- Latest checkpoint: `chk_sha256v1_432187a085f7`
+- 12 total runs indexed, 3 match latest checkpoint
+- 1512 deduped items, 1 machine (Apple M4 Pro / 64GB)
+- 24 models tested: qwen3.x, qwen2.5, devstral, gpt-oss:20b, etc.
+- 3 harnesses: direct, goose, opencode
+- 16 tests: smoke, calculator-*, todo-app, tool-smoke, rate-limiter, ttl-cache, event-emitter, workspace-*, file-*, targeted-edit, safe-cleanup
+- Top models: gpt-oss:20b (98.5%), qwen3.5:27b (97.1%), devstral-small-2:24b / qwen3:8b (94.6%)
+- Bottom: llama3.2:3b (18.9%), qwen3.5:0.8b (20.3%)
+
+## Composite Score Formula
+effectiveScore = passRate × 0.4 + completionRate × 0.3 + toolSuccessRate × 0.3
+
+## Checkpoint System
+- `src/lib/benchmark-checkpoint.ts` — SHA-256 hash of all test assets + core lib files
+- A checkpoint rolls (gets a new checkpoint ID) whenever test prompts, scoring specs, or core pipeline code change
+- Leaderboard only shows runs matching the latest checkpoint
+
+## Aggregation / Dedup Logic
+- Best result per (machineInstanceId × matrixKey) across runs within a checkpoint
+- Published via the `bun dashboard:index` command
+- Leaderboard uses `aggregates/latest.json` only
+
+## User Preferences
+- (none recorded yet)
diff --git a/src/lib/benchmark-checkpoint.ts b/src/lib/benchmark-checkpoint.ts
@@ -30,8 +30,13 @@ const CORE_BENCHMARK_LIB_ASSETS = [
 	"src/lib/benchmark-checkpoint.ts",
 	"src/lib/scorer.ts",
 	"src/lib/scorer-core.ts",
+	"src/lib/code-module-scorer.ts",
 	"src/lib/scorer-worker.ts",
 	"src/lib/scoring-spec.ts",
+	"src/lib/workspace-scorer.ts",
+	"src/lib/workspace-manifest.ts",
+	"src/lib/test-workspace.ts",
+	"src/lib/signal-assessment.ts",
 	"src/lib/code-extractor.ts",
 	"src/lib/stdout-suppressor.ts",
 	"src/lib/test-catalog.ts",

diff --git a/test/benchmark-checkpoint.test.ts b/test/benchmark-checkpoint.test.ts
@@ -21,8 +21,13 @@ const REQUIRED_LIB_ASSETS = [
 	"benchmark-checkpoint.ts",
 	"scorer.ts",
 	"scorer-core.ts",
+	"code-module-scorer.ts",
 	"scorer-worker.ts",
 	"scoring-spec.ts",
+	"workspace-scorer.ts",
+	"workspace-manifest.ts",
+	"test-workspace.ts",
+	"signal-assessment.ts",
 	"code-extractor.ts",
 	"stdout-suppressor.ts",
 	"test-catalog.ts",
@@ -141,12 +146,33 @@ describe("benchmark checkpoint", () => {
 		expect(after.checkpointId).not.toBe(before.checkpointId);
 	});
 
+	it("changes manifest hash when signal assessment changes", () => {
+		const root = createBenchmarkRoot();
+		const before = computeBenchmarkCheckpoint(root); // baseline, computed before the file is rewritten
+
+		const signalAssessmentPath = path.join(
+			root,
+			"src",
+			"lib",
+			"signal-assessment.ts",
+		);
+		fs.writeFileSync(
+			signalAssessmentPath,
+			'export const signal_assessment_ts = "updated";\n',
+		); // replaces the contents of signal-assessment.ts on disk
+
+		const after = computeBenchmarkCheckpoint(root); // recomputed after the rewrite
+		expect(after.manifestHash).not.toBe(before.manifestHash);
+		expect(after.checkpointId).not.toBe(before.checkpointId); // the ID must change together with the hash
+	});
+
 	it("collects a deterministic sorted asset list", () => {
 		const root = createBenchmarkRoot();
 		const assets = collectBenchmarkAssetPaths(root);
 		expect(assets).toEqual([...assets].sort((a, b) => a.localeCompare(b)));
 		expect(assets).toContain("src/harnesses/direct-adapter.ts");
 		expect(assets).toContain("src/lib/scorer.ts");
+		expect(assets).toContain("src/lib/signal-assessment.ts");
 		expect(assets).toContain("src/tests/smoke/prompt.blind.md");
 	});
 });
diff --git a/test/build-index.test.ts b/test/build-index.test.ts
@@ -29,8 +29,13 @@ const REQUIRED_LIB_ASSETS = [
 	"benchmark-checkpoint.ts",
 	"scorer.ts",
 	"scorer-core.ts",
+	"code-module-scorer.ts",
 	"scorer-worker.ts",
 	"scoring-spec.ts",
+	"workspace-scorer.ts",
+	"workspace-manifest.ts",
+	"test-workspace.ts",
+	"signal-assessment.ts",
 	"code-extractor.ts",
 	"stdout-suppressor.ts",
 	"test-catalog.ts",