Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
* Invariants:
* - Renders aggregate charts before filter/table sections
* - Uses filtered aggregate items so charts stay aligned with visible leaderboard scope
* Hierarchy: heatmap → composite → pass+blind → radar+token → failure+difficulty → timing+frontier → head-to-head
* Hierarchy: composite → heatmap → pass+blind → radar+token → failure+difficulty → timing+frontier → head-to-head
*/

import { BlindVsInformedChart } from "@/components/charts/blind-vs-informed-chart";
Expand Down Expand Up @@ -44,22 +44,21 @@ export function LeaderboardChartGallery({
Aggregate Charts
</h2>
<p className="mt-1 text-sm text-foreground-muted">
The leaderboard now opens on the aggregate signal: rank, prompt
delta, timing, frontier alignment, and pass-rate breakdowns across
the latest checkpoint.
Aggregate signals for the latest checkpoint: model rankings, prompt
delta, timing, frontier alignment, and pass-rate breakdowns.
</p>
</div>
<Badge variant="secondary" className="w-fit">
{items.length} filtered items
</Badge>
</div>

{/* 1. Hero: Model x Test Heatmap — full overview at a glance */}
<ModelTestHeatmap items={items} />

{/* 2. Composite Score — primary analytical ranking */}
{/* 1. Composite Score — primary ranking: which model is best */}
<CompositeScoreChart items={items} />

{/* 2. Model x Test Heatmap — per-test breakdown after rank context */}
<ModelTestHeatmap items={items} />

{/* 3. Core metrics: Pass Rate + Blind vs Informed */}
<div className="grid gap-4 xl:grid-cols-2">
<PassRateChart items={items} />
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ const CARD_ACCENT_COLORS = [
"hsl(212, 100%, 67%)", // info blue — deduped items
"hsl(156, 67%, 55%)", // success green — pass rate
"hsl(270, 60%, 60%)", // purple — frontier coverage
"hsl(210, 85%, 60%)", // blue — avg duration
"hsl(210, 85%, 60%)", // blue — median duration
];

/**
Expand Down Expand Up @@ -97,7 +97,7 @@ export function LeaderboardSummaryCards({
sub: `${frontierCount} of ${filteredItemCount} items`,
},
{
title: "Avg Duration",
title: "Median Duration",
value: medianDuration !== null ? formatDuration(medianDuration) : "—",
sub: durations.length > 0 ? `${durations.length} items` : "no data",
},
Expand Down
46 changes: 23 additions & 23 deletions apps/dashboard/src/components/run-detail/run-detail-page.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -224,29 +224,6 @@ export function RunDetailPage({ run, plan }: RunDetailPageProps) {
</TabsList>

<TabsContent value="overview" className="mt-4 space-y-6">
<CoverageDiagnostics run={run} plan={plan} />

{/* Matrix Table */}
<Card>
<CardHeader>
<CardTitle className="text-base">Results Matrix</CardTitle>
</CardHeader>
<CardContent>
<MatrixTable items={run.items} onRowClick={setSelectedItem} />
</CardContent>
</Card>

{/* Breakdowns */}
<div className="grid gap-4 md:grid-cols-2">
<ScoringBreakdown items={run.items} />
<ToolingBreakdown items={run.items} />
</div>

<div className="grid gap-4 md:grid-cols-2">
<TimingStats items={run.items} />
<FailureBreakdown items={run.items} />
</div>

{/* Primary Chart - Composite Scores */}
<CompositeScoreChart
items={run.items}
Expand All @@ -261,7 +238,30 @@ export function RunDetailPage({ run, plan }: RunDetailPageProps) {
<TimingDistribution items={run.items} />
</div>

{/* Breakdowns */}
<div className="grid gap-4 md:grid-cols-2">
<ScoringBreakdown items={run.items} />
<ToolingBreakdown items={run.items} />
</div>

<div className="grid gap-4 md:grid-cols-2">
<TimingStats items={run.items} />
<FailureBreakdown items={run.items} />
</div>

<FrontierEvalScatter items={run.items} />

<CoverageDiagnostics run={run} plan={plan} />

{/* Matrix Table */}
<Card>
<CardHeader>
<CardTitle className="text-base">Results Matrix</CardTitle>
</CardHeader>
<CardContent>
<MatrixTable items={run.items} onRowClick={setSelectedItem} />
</CardContent>
</Card>
</TabsContent>

<TabsContent value="model" className="mt-4">
Expand Down
47 changes: 47 additions & 0 deletions memory/MEMORY.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# plebdev-bench Memory

## Project
Local LLM benchmark runner — CLI-driven test harness + scoring pipeline.
Stack: Bun + TypeScript, Zod, Vitest, Pino, Recharts (dashboard), React/Vite.

## Key Paths
- `src/` — CLI, runner, harnesses, runtimes, lib, schemas, tests
- `apps/dashboard/` — React dashboard (Vite + Tailwind + Recharts)
- `apps/dashboard/public/results/` — Published static run artifacts
- `results/` — Raw timestamped run outputs (plan.json + run.json)
- `llm/project/` — Canonical project docs (read before making changes)

## Dashboard Architecture
- Data source: static JSON files under `apps/dashboard/public/results/`
- `index.json` — DashboardIndex listing all runs (v3 schema)
- `<runId>/plan.json` + `<runId>/run.json` — Per-run artifacts
- `aggregates/latest.json` — Latest checkpoint aggregate for leaderboard
- `aggregates/<checkpointId>.json` — Per-checkpoint aggregates
- 4 pages: Leaderboard, Run List, Run Detail, About
- Key lib files: `src/lib/api.ts`, `src/lib/types.ts`, `src/lib/aggregations*.ts`

## Current Data State (as of 2026-03-30)
- Latest checkpoint: `chk_sha256v1_432187a085f7`
- 12 total runs indexed, 3 match latest checkpoint
- 1512 deduped items, 1 machine (Apple M4 Pro / 64GB)
- 24 models tested: qwen3.x, qwen2.5, devstral, gpt-oss:20b, etc.
- 3 harnesses: direct, goose, opencode
- 16 tests: smoke, calculator-*, todo-app, tool-smoke, rate-limiter, ttl-cache, event-emitter, workspace-*, file-*, targeted-edit, safe-cleanup
- Top models: gpt-oss:20b (98.5%), qwen3.5:27b (97.1%), devstral-small-2:24b / qwen3:8b (94.6%)
- Bottom: llama3.2:3b (18.9%), qwen3.5:0.8b (20.3%)

## Composite Score Formula
effectiveScore = passRate × 0.4 + completionRate × 0.3 + toolSuccessRate × 0.3

## Checkpoint System
- `src/lib/benchmark-checkpoint.ts` — SHA-256 hash of all test assets + core lib files
- Checkpoints roll when test prompts, scoring specs, or core pipeline code changes
- Leaderboard only shows runs matching the latest checkpoint

## Aggregation / Dedup Logic
- Best result per (machineInstanceId × matrixKey) across runs within a checkpoint
- Published via `bun dashboard:index` command
- Leaderboard uses `aggregates/latest.json` only

## User Preferences
- (none recorded yet)
5 changes: 5 additions & 0 deletions src/lib/benchmark-checkpoint.ts
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,13 @@ const CORE_BENCHMARK_LIB_ASSETS = [
"src/lib/benchmark-checkpoint.ts",
"src/lib/scorer.ts",
"src/lib/scorer-core.ts",
"src/lib/code-module-scorer.ts",
"src/lib/scorer-worker.ts",
"src/lib/scoring-spec.ts",
"src/lib/workspace-scorer.ts",
"src/lib/workspace-manifest.ts",
"src/lib/test-workspace.ts",
"src/lib/signal-assessment.ts",
"src/lib/code-extractor.ts",
"src/lib/stdout-suppressor.ts",
"src/lib/test-catalog.ts",
Expand Down
26 changes: 26 additions & 0 deletions test/benchmark-checkpoint.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,13 @@ const REQUIRED_LIB_ASSETS = [
"benchmark-checkpoint.ts",
"scorer.ts",
"scorer-core.ts",
"code-module-scorer.ts",
"scorer-worker.ts",
"scoring-spec.ts",
"workspace-scorer.ts",
"workspace-manifest.ts",
"test-workspace.ts",
"signal-assessment.ts",
"code-extractor.ts",
"stdout-suppressor.ts",
"test-catalog.ts",
Expand Down Expand Up @@ -141,12 +146,33 @@ describe("benchmark checkpoint", () => {
expect(after.checkpointId).not.toBe(before.checkpointId);
});

it("changes manifest hash when signal assessment changes", () => {
  // Capture the checkpoint for the root returned by createBenchmarkRoot().
  const benchmarkRoot = createBenchmarkRoot();
  const baseline = computeBenchmarkCheckpoint(benchmarkRoot);

  // Overwrite src/lib/signal-assessment.ts under that root with new content.
  fs.writeFileSync(
    path.join(benchmarkRoot, "src", "lib", "signal-assessment.ts"),
    'export const signal_assessment_ts = "updated";\n',
  );

  // Recomputing must yield a different manifest hash and checkpoint id.
  const mutated = computeBenchmarkCheckpoint(benchmarkRoot);
  expect(mutated.manifestHash).not.toBe(baseline.manifestHash);
  expect(mutated.checkpointId).not.toBe(baseline.checkpointId);
});

it("collects a deterministic sorted asset list", () => {
  const benchmarkRoot = createBenchmarkRoot();
  const collected = collectBenchmarkAssetPaths(benchmarkRoot);

  // The returned list must already be in localeCompare order: sorting a copy
  // must not change it.
  const expectedOrder = [...collected];
  expectedOrder.sort((left, right) => left.localeCompare(right));
  expect(collected).toEqual(expectedOrder);

  // Spot-check representative harness, lib, and test-prompt paths.
  const requiredPaths = [
    "src/harnesses/direct-adapter.ts",
    "src/lib/scorer.ts",
    "src/lib/signal-assessment.ts",
    "src/tests/smoke/prompt.blind.md",
  ];
  for (const requiredPath of requiredPaths) {
    expect(collected).toContain(requiredPath);
  }
});
});
5 changes: 5 additions & 0 deletions test/build-index.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,13 @@ const REQUIRED_LIB_ASSETS = [
"benchmark-checkpoint.ts",
"scorer.ts",
"scorer-core.ts",
"code-module-scorer.ts",
"scorer-worker.ts",
"scoring-spec.ts",
"workspace-scorer.ts",
"workspace-manifest.ts",
"test-workspace.ts",
"signal-assessment.ts",
"code-extractor.ts",
"stdout-suppressor.ts",
"test-catalog.ts",
Expand Down