From b85d99b56bb0fbccdb0248e4eaf7d6536762300e Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Wed, 25 Feb 2026 12:56:15 +0000 Subject: [PATCH] feat(tooling): add benchmark summary report utility Closes #367 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- examples/features/benchmark-tooling/README.md | 42 +- .../scripts/benchmark-report.ts | 593 ++++++++++++++++++ 2 files changed, 634 insertions(+), 1 deletion(-) create mode 100644 examples/features/benchmark-tooling/scripts/benchmark-report.ts diff --git a/examples/features/benchmark-tooling/README.md b/examples/features/benchmark-tooling/README.md index aac7bec3d..30371fff1 100644 --- a/examples/features/benchmark-tooling/README.md +++ b/examples/features/benchmark-tooling/README.md @@ -152,6 +152,42 @@ bun examples/features/benchmark-tooling/scripts/significance-test.ts baseline.js - **Identical scores**: p-value = 1, not significant (correct behavior). - **< 2 pairs**: Cannot test; exits with code 1. +## benchmark-report + +Generates a consolidated benchmark summary across models and metrics from result JSONL files. Produces per-target aggregates (mean, std dev, median, pass rate, 95% CI) and per-metric breakdowns when evaluator-level scores are present. 
+ +### Usage + +```bash +# Summarize all result files in a directory +bun examples/features/benchmark-tooling/scripts/benchmark-report.ts ./by-target/ + +# Summarize specific files +bun examples/features/benchmark-tooling/scripts/benchmark-report.ts results.gpt-4.1.jsonl results.claude-sonnet-4.jsonl + +# Machine-readable JSON output +bun examples/features/benchmark-tooling/scripts/benchmark-report.ts ./by-target/ --json + +# Sort by score (descending) and set custom pass threshold +bun examples/features/benchmark-tooling/scripts/benchmark-report.ts ./by-target/ --sort score --pass-threshold 0.7 +``` + +### Options + +| Flag | Default | Description | +|---|---|---| +| `--json` | — | Output machine-readable JSON only | +| `--sort <key>` | `name` | Sort targets by: `name`, `score`, `pass_rate` | +| `--pass-threshold <n>` | `0.5` | Score threshold to count as pass | + +### Output + +**Per-Target Summary** includes for each model: record count, mean score, standard deviation, median, min, max, pass rate, and 95% confidence interval. + +**Per-Target Metric Breakdown** appears when records contain evaluator-level `scores[]` arrays, showing mean and spread for each evaluator (e.g., accuracy, latency) per target. + +**Machine-readable JSON** output (`--json`) returns a structured `BenchmarkReport` object with `summary`, `per_target`, `per_target_metrics`, and `overall` fields. + ### End-to-End Workflow ```bash @@ -171,7 +207,11 @@ bun examples/features/benchmark-tooling/scripts/win-rate-summary.ts comparison.j bun examples/features/benchmark-tooling/scripts/significance-test.ts \ ./by-target/results.gpt-4.1.jsonl ./by-target/results.claude-sonnet-4.jsonl -# 6. CI gate: use JSON output for programmatic checks +# 6. Consolidated benchmark report +bun examples/features/benchmark-tooling/scripts/benchmark-report.ts ./by-target/ + +# 7. 
CI gate: use JSON output for programmatic checks +bun examples/features/benchmark-tooling/scripts/benchmark-report.ts ./by-target/ --json bun examples/features/benchmark-tooling/scripts/win-rate-summary.ts comparison.json --json bun examples/features/benchmark-tooling/scripts/significance-test.ts \ ./by-target/results.gpt-4.1.jsonl ./by-target/results.claude-sonnet-4.jsonl --json diff --git a/examples/features/benchmark-tooling/scripts/benchmark-report.ts b/examples/features/benchmark-tooling/scripts/benchmark-report.ts new file mode 100644 index 000000000..1d846d10f --- /dev/null +++ b/examples/features/benchmark-tooling/scripts/benchmark-report.ts @@ -0,0 +1,593 @@ +#!/usr/bin/env bun +/** + * benchmark-report — Consolidated benchmark summary across models and metrics. + * + * Reads multiple result JSONL files (one per model/target) and produces a + * summary report with per-target aggregates, per-metric breakdowns, and + * overall statistics including uncertainty measures. + * + * Usage: + * bun benchmark-report.ts <file1.jsonl> [file2.jsonl ...] 
/**
 * CLI reference for the benchmark-report script.
 *
 *   bun benchmark-report.ts <file1.jsonl> [file2.jsonl ...] [options]
 *   bun benchmark-report.ts <directory> [options]
 *
 * Options:
 *   --json                  Output machine-readable JSON only
 *   --format <fmt>          Output format: "markdown" (default) or "json"
 *   --sort <key>            Sort targets by: "name" (default), "score", "pass_rate"
 *   --pass-threshold <n>    Score threshold to count as pass (default: 0.5)
 */

import { readFileSync, readdirSync, statSync } from "node:fs";
import { resolve, basename } from "node:path";

// ---------------------------------------------------------------------------
// Types
// ---------------------------------------------------------------------------

/** One parsed JSONL line, reduced to the fields this script reads. */
interface ResultRecord {
  test_id?: string;
  eval_id?: string;
  target?: string;
  score: number;
  scores?: EvaluatorScore[];
  trials?: TrialRecord[];
  aggregation?: AggregationInfo;
}

/** One entry of a record's `scores[]` array; only `name` and `score` are read here. */
interface EvaluatorScore {
  name: string;
  type?: string;
  score: number;
  weight?: number;
  verdict?: string;
}

/** One entry of a record's `trials[]` array; carried through but not aggregated here. */
interface TrialRecord {
  attempt: number;
  score: number;
  verdict?: string;
}

/** A record's `aggregation` object; only `ci95_lower` / `ci95_upper` are read here. */
interface AggregationInfo {
  strategy?: string;
  mean?: number;
  min?: number;
  max?: number;
  ci95_lower?: number;
  ci95_upper?: number;
  stddev?: number;
  passed_attempts?: number;
  total_attempts?: number;
}

/** Aggregate statistics over the top-level `score` of a group of records. */
interface TargetStats {
  target: string;
  n: number;
  mean_score: number;
  std_dev: number;
  min_score: number;
  max_score: number;
  median_score: number;
  pass_count: number;
  pass_rate: number;
  ci95_lower: number | null;
  ci95_upper: number | null;
}

/** Aggregate statistics for one named evaluator metric. */
interface MetricStats {
  metric: string;
  n: number;
  mean_score: number;
  std_dev: number;
  min_score: number;
  max_score: number;
}

/** All metric aggregates belonging to one target. */
interface TargetMetricStats {
  target: string;
  metrics: MetricStats[];
}

/** Shape of the object printed by `--json`. */
interface BenchmarkReport {
  summary: {
    total_records: number;
    total_targets: number;
    total_test_ids: number;
    pass_threshold: number;
  };
  per_target: TargetStats[];
  per_target_metrics: TargetMetricStats[] | null;
  overall: TargetStats;
}

// ---------------------------------------------------------------------------
// Statistics helpers
// ---------------------------------------------------------------------------

/** Arithmetic mean; returns 0 for an empty array. */
function mean(values: number[]): number {
  if (values.length === 0) return 0;
  let total = 0;
  for (const v of values) total += v;
  return total / values.length;
}

/** Sample standard deviation (n - 1 denominator); returns 0 when fewer than two values. */
function stdDev(values: number[]): number {
  if (values.length < 2) return 0;
  const m = mean(values);
  let squaredDeviations = 0;
  for (const v of values) squaredDeviations += (v - m) ** 2;
  return Math.sqrt(squaredDeviations / (values.length - 1));
}

/** Median of the values (input is not mutated); returns 0 for an empty array. */
function median(values: number[]): number {
  if (values.length === 0) return 0;
  const sorted = [...values].sort((a, b) => a - b);
  const mid = Math.floor(sorted.length / 2);
  return sorted.length % 2 !== 0
    ? sorted[mid]
    : (sorted[mid - 1] + sorted[mid]) / 2;
}

/**
 * Smallest value, or 0 for an empty array.
 *
 * Implemented as a loop rather than `Math.min(...values)` because spreading a
 * very large array into call arguments can exceed the engine's argument limit.
 */
function minOf(values: number[]): number {
  if (values.length === 0) return 0;
  let lowest = values[0];
  for (const v of values) {
    if (v < lowest) lowest = v;
  }
  return lowest;
}

/** Largest value, or 0 for an empty array (loop-based for the same reason as `minOf`). */
function maxOf(values: number[]): number {
  if (values.length === 0) return 0;
  let highest = values[0];
  for (const v of values) {
    if (v > highest) highest = v;
  }
  return highest;
}

/**
 * 95% confidence interval of the mean, rounded to 6 decimals.
 *
 * @returns `null` when fewer than two values are given (no spread to estimate).
 */
function ci95(values: number[]): { lower: number; upper: number } | null {
  const n = values.length;
  if (n < 2) return null;
  const m = mean(values);
  const standardError = stdDev(values) / Math.sqrt(n);
  // Normal approximation (1.96) for n >= 30, tabulated t-value below that.
  const t = n >= 30 ? 1.96 : tValue95(n - 1);
  return {
    lower: round(m - t * standardError, 6),
    upper: round(m + t * standardError, 6),
  };
}

/** Two-tailed 95% t-values as [degrees of freedom, t] pairs, ascending by df. */
const T_TABLE_95: ReadonlyArray<readonly [number, number]> = [
  [1, 12.706], [2, 4.303], [3, 3.182], [4, 2.776], [5, 2.571],
  [6, 2.447], [7, 2.365], [8, 2.306], [9, 2.262], [10, 2.228],
  [15, 2.131], [20, 2.086], [25, 2.06], [29, 2.045],
];

/**
 * Approximate t-value for a 95% CI with small df.
 *
 * Returns the entry for the largest tabulated df that does not exceed `df`
 * (an exact hit returns that entry). Using a lower df yields a larger t, so
 * gaps in the table err toward a wider interval. Falls back to 1.96 when no
 * tabulated df is <= `df`.
 */
function tValue95(df: number): number {
  for (let i = T_TABLE_95.length - 1; i >= 0; i--) {
    const [tableDf, t] = T_TABLE_95[i];
    if (tableDf <= df) return t;
  }
  return 1.96;
}

/** Rounds `value` to `decimals` decimal places using `Math.round`. */
function round(value: number, decimals: number): number {
  const factor = 10 ** decimals;
  return Math.round(value * factor) / factor;
}

// ---------------------------------------------------------------------------
// Core logic
// ---------------------------------------------------------------------------

/**
 * Reads one JSONL file and returns its usable result records.
 *
 * Lines are skipped (not treated as errors) when they are blank, are not valid
 * JSON, do not parse to a plain object, or lack a finite numeric `score`.
 *
 * @param filePath       Path of the JSONL file to read.
 * @param fallbackTarget Target name used when a record has no non-empty string `target`.
 * @throws Whatever `readFileSync` throws when the file cannot be read.
 */
function readResultFile(
  filePath: string,
  fallbackTarget: string,
): ResultRecord[] {
  // A leading BOM is not JSON whitespace and would make the first line unparsable.
  const content = readFileSync(filePath, "utf-8").replace(/^\uFEFF/, "");
  const records: ResultRecord[] = [];

  for (const line of content.split("\n")) {
    if (line.trim().length === 0) continue;

    let parsed: unknown;
    try {
      parsed = JSON.parse(line);
    } catch {
      continue;
    }
    // JSON.parse may legally return null, a primitive, or an array.
    if (parsed === null || typeof parsed !== "object" || Array.isArray(parsed)) {
      continue;
    }
    const raw = parsed as Record<string, unknown>;

    const score = raw.score;
    if (typeof score !== "number" || !Number.isFinite(score)) continue;

    // `test_id` wins over `eval_id`; numeric ids are kept as strings.
    const rawId = raw.test_id ?? raw.eval_id;
    const testId =
      typeof rawId === "string"
        ? rawId
        : typeof rawId === "number"
          ? String(rawId)
          : undefined;

    const target =
      typeof raw.target === "string" && raw.target.length > 0
        ? raw.target
        : fallbackTarget;

    const record: ResultRecord = { test_id: testId, target, score };

    if (Array.isArray(raw.scores)) {
      record.scores = raw.scores as EvaluatorScore[];
    }
    if (Array.isArray(raw.trials)) {
      record.trials = raw.trials as TrialRecord[];
    }
    if (raw.aggregation && typeof raw.aggregation === "object") {
      record.aggregation = raw.aggregation as AggregationInfo;
    }

    records.push(record);
  }

  return records;
}

/** Derives the fallback target name from a file name: strips `.jsonl` and a leading `results.`. */
function targetFromFileName(path: string): string {
  return basename(path, ".jsonl").replace(/^results\./, "");
}

/**
 * Loads records from every input path.
 *
 * A directory contributes all of its direct `*.jsonl` entries in sorted order
 * (a warning is printed to stderr when it has none); any other path is read as
 * a single result file.
 *
 * @throws Whatever `statSync` / `readdirSync` / `readFileSync` throw, e.g. for a missing path.
 */
function loadRecords(inputPaths: string[]): ResultRecord[] {
  const all: ResultRecord[] = [];

  for (const inputPath of inputPaths) {
    if (!statSync(inputPath).isDirectory()) {
      all.push(...readResultFile(inputPath, targetFromFileName(inputPath)));
      continue;
    }

    const files = readdirSync(inputPath)
      .filter((f) => f.endsWith(".jsonl"))
      .sort();
    if (files.length === 0) {
      console.error(`Warning: no .jsonl files found in ${inputPath}`);
      continue;
    }
    for (const f of files) {
      all.push(...readResultFile(resolve(inputPath, f), targetFromFileName(f)));
    }
  }

  return all;
}

/**
 * Aggregates the top-level scores of `records` under the label `target`.
 *
 * A record passes when its score is >= `passThreshold`. The CI is computed
 * from the scores; when that is impossible (fewer than two records) the first
 * record carrying both `aggregation.ci95_lower` and `aggregation.ci95_upper`
 * supplies the interval instead, otherwise both bounds are `null`.
 */
function computeTargetStats(
  target: string,
  records: ResultRecord[],
  passThreshold: number,
): TargetStats {
  const scores = records.map((r) => r.score);
  const passCount = scores.filter((s) => s >= passThreshold).length;
  const confidence = ci95(scores);

  // Trial-level interval reported by the first record that has one.
  const withTrialCi = records.find(
    (r) => r.aggregation?.ci95_lower != null && r.aggregation?.ci95_upper != null,
  );
  const trialCiLower = withTrialCi?.aggregation?.ci95_lower ?? null;
  const trialCiUpper = withTrialCi?.aggregation?.ci95_upper ?? null;

  return {
    target,
    n: scores.length,
    mean_score: round(mean(scores), 4),
    std_dev: round(stdDev(scores), 4),
    min_score: round(minOf(scores), 4),
    max_score: round(maxOf(scores), 4),
    median_score: round(median(scores), 4),
    pass_count: passCount,
    pass_rate: round(scores.length > 0 ? passCount / scores.length : 0, 4),
    ci95_lower: confidence?.lower ?? trialCiLower,
    ci95_upper: confidence?.upper ?? trialCiUpper,
  };
}

/**
 * Collects the raw evaluator scores of `records`, grouped by metric name.
 *
 * Entries without a name or without a numeric score are ignored.
 *
 * @returns metric name -> list of scores, or `null` when no usable entry exists.
 */
function computeMetricStats(
  records: ResultRecord[],
): Map<string, number[]> | null {
  const metricScores = new Map<string, number[]>();

  for (const r of records) {
    if (!Array.isArray(r.scores)) continue;
    for (const s of r.scores) {
      // `scores` comes from unvalidated JSON, so an entry may be null or a primitive.
      if (s == null || typeof s !== "object") continue;
      if (!s.name || typeof s.score !== "number") continue;
      const bucket = metricScores.get(s.name);
      if (bucket) {
        bucket.push(s.score);
      } else {
        metricScores.set(s.name, [s.score]);
      }
    }
  }

  return metricScores.size > 0 ? metricScores : null;
}

/**
 * Builds the full report from already-loaded records.
 *
 * @param records       All records across all targets.
 * @param passThreshold Minimum score that counts as a pass.
 * @param sortBy        "score" or "pass_rate" sort descending by that value;
 *                      anything else sorts by target name.
 */
function buildReport(
  records: ResultRecord[],
  passThreshold: number,
  sortBy: string,
): BenchmarkReport {
  // Group by target
  const byTarget = new Map<string, ResultRecord[]>();
  const testIds = new Set<string>();

  for (const r of records) {
    const target = r.target ?? "unknown";
    const group = byTarget.get(target);
    if (group) {
      group.push(r);
    } else {
      byTarget.set(target, [r]);
    }
    if (r.test_id) testIds.add(r.test_id);
  }

  // Per-target stats
  const perTarget = [...byTarget].map(([target, recs]) =>
    computeTargetStats(target, recs, passThreshold),
  );

  switch (sortBy) {
    case "score":
      perTarget.sort((a, b) => b.mean_score - a.mean_score);
      break;
    case "pass_rate":
      perTarget.sort((a, b) => b.pass_rate - a.pass_rate);
      break;
    default:
      perTarget.sort((a, b) => a.target.localeCompare(b.target));
  }

  // Per-target per-metric stats
  let perTargetMetrics: TargetMetricStats[] | null = null;
  const anyMetrics = records.some(
    (r) => Array.isArray(r.scores) && r.scores.length > 0,
  );

  if (anyMetrics) {
    perTargetMetrics = [];
    for (const [target, recs] of byTarget) {
      const metricMap = computeMetricStats(recs);
      if (!metricMap) continue;

      const metrics: MetricStats[] = [...metricMap].map(([name, scores]) => ({
        metric: name,
        n: scores.length,
        mean_score: round(mean(scores), 4),
        std_dev: round(stdDev(scores), 4),
        min_score: round(minOf(scores), 4),
        max_score: round(maxOf(scores), 4),
      }));
      metrics.sort((a, b) => a.metric.localeCompare(b.metric));
      perTargetMetrics.push({ target, metrics });
    }
    // Keep the metric sections in the same order as the per_target rows.
    const targetOrder = new Map<string, number>(
      perTarget.map((t, i) => [t.target, i]),
    );
    perTargetMetrics.sort(
      (a, b) =>
        (targetOrder.get(a.target) ?? 0) - (targetOrder.get(b.target) ?? 0),
    );
  }

  return {
    summary: {
      total_records: records.length,
      total_targets: byTarget.size,
      total_test_ids: testIds.size,
      pass_threshold: passThreshold,
    },
    per_target: perTarget,
    per_target_metrics: perTargetMetrics,
    overall: computeTargetStats("overall", records, passThreshold),
  };
}

// ---------------------------------------------------------------------------
// Output formatters
// ---------------------------------------------------------------------------

/** Formats a 0..1 rate as a percentage with one decimal, e.g. 0.5 -> "50.0%". */
function pct(rate: number): string {
  return `${(rate * 100).toFixed(1)}%`;
}

/** Right-pads `str` with spaces to `width`; longer strings are returned unchanged. */
function pad(str: string, width: number): string {
  return str.padEnd(width);
}

/** Left-pads `str` with spaces to `width`; longer strings are returned unchanged. */
function padLeft(str: string, width: number): string {
  return str.padStart(width);
}

/** Formats a CI as "[lower, upper]" with 4 decimals, or "—" when either bound is missing. */
function formatCI(lower: number | null, upper: number | null): string {
  if (lower == null || upper == null) return "—";
  return `[${lower.toFixed(4)}, ${upper.toFixed(4)}]`;
}

/** Renders one row of the per-target table; `label` fills the first column. */
function formatStatsRow(label: string, s: TargetStats): string {
  return [
    pad(label, 22),
    padLeft(String(s.n), 5),
    padLeft(s.mean_score.toFixed(4), 8),
    padLeft(s.std_dev.toFixed(4), 8),
    padLeft(s.median_score.toFixed(4), 8),
    padLeft(s.min_score.toFixed(4), 8),
    padLeft(s.max_score.toFixed(4), 8),
    padLeft(pct(s.pass_rate), 8),
    ` ${formatCI(s.ci95_lower, s.ci95_upper)}`,
  ].join("");
}

/** Prints the human-readable report (summary, per-target table, metric breakdown) to stdout. */
function printMarkdown(report: BenchmarkReport): void {
  const divider = "─".repeat(80);
  const tableRule = "─".repeat(99);

  console.log(`\n${divider}`);
  console.log(" Benchmark Report");
  console.log(divider);
  console.log(
    ` Records: ${report.summary.total_records} | Targets: ${report.summary.total_targets} | Test IDs: ${report.summary.total_test_ids} | Pass threshold: ${report.summary.pass_threshold}`,
  );
  console.log(divider);

  // Per-target table
  console.log("\n## Per-Target Summary\n");
  console.log(
    [
      pad("Target", 22),
      padLeft("N", 5),
      padLeft("Mean", 8),
      padLeft("Std", 8),
      padLeft("Med", 8),
      padLeft("Min", 8),
      padLeft("Max", 8),
      padLeft("Pass%", 8),
      pad(" 95% CI", 24),
    ].join(""),
  );
  console.log(tableRule);

  for (const t of report.per_target) {
    // Names are cut to 21 characters so the 22-wide column keeps a separator.
    console.log(formatStatsRow(t.target.slice(0, 21), t));
  }

  console.log(tableRule);
  console.log(formatStatsRow("overall", report.overall));

  // Per-target per-metric breakdown
  if (report.per_target_metrics && report.per_target_metrics.length > 0) {
    console.log("\n## Per-Target Metric Breakdown\n");

    for (const tm of report.per_target_metrics) {
      console.log(`### ${tm.target}\n`);
      console.log(
        [
          pad("Metric", 22),
          padLeft("N", 5),
          padLeft("Mean", 8),
          padLeft("Std", 8),
          padLeft("Min", 8),
          padLeft("Max", 8),
        ].join(""),
      );
      console.log("─".repeat(59));

      for (const m of tm.metrics) {
        console.log(
          [
            pad(m.metric.slice(0, 21), 22),
            padLeft(String(m.n), 5),
            padLeft(m.mean_score.toFixed(4), 8),
            padLeft(m.std_dev.toFixed(4), 8),
            padLeft(m.min_score.toFixed(4), 8),
            padLeft(m.max_score.toFixed(4), 8),
          ].join(""),
        );
      }
      console.log();
    }
  }

  console.log(divider);
}

// ---------------------------------------------------------------------------
// CLI
// ---------------------------------------------------------------------------

const USAGE = `Usage: bun benchmark-report.ts <file1.jsonl> [file2.jsonl ...] [options]
       bun benchmark-report.ts <directory> [options]

Generates a consolidated benchmark summary from multiple result JSONL files.
Aggregates per-target (model) and per-metric statistics with uncertainty.

Options:
  --json                      Output machine-readable JSON only
  -f, --format <fmt>          Output format: "markdown" (default) or "json"
  -s, --sort <key>            Sort targets by: "name" (default), "score", "pass_rate"
  -p, --pass-threshold <n>    Score threshold to count as pass (default: 0.5)
  -h, --help                  Show this help

Input:
  One or more .jsonl result files, or a directory containing .jsonl files.
  Each record must have a "score" field. The "target" field identifies
  the model; if absent, the filename is used as fallback.

Examples:
  bun benchmark-report.ts results.gpt-4.1.jsonl results.claude-sonnet-4.jsonl
  bun benchmark-report.ts ./by-target/
  bun benchmark-report.ts ./by-target/ --json --sort score
  bun benchmark-report.ts ./by-target/ --pass-threshold 0.7`;

const SORT_KEYS: readonly string[] = ["name", "score", "pass_rate"];

/** Settings collected from the command line. */
interface CliOptions {
  jsonOutput: boolean;
  sortBy: string;
  passThreshold: number;
  inputPaths: string[];
}

/** Outcome of parsing the command line: show help, report an error, or run. */
type CliParseResult =
  | { kind: "help" }
  | { kind: "error"; message: string }
  | { kind: "run"; options: CliOptions };

/**
 * Parses the arguments that follow the script name.
 *
 * Anything that is not a recognised flag and does not start with "-" is an
 * input path (resolved to an absolute path). A literal "--" makes every later
 * argument a path, so paths beginning with "-" remain reachable. Unknown
 * options are reported as errors instead of being mistaken for paths.
 */
function parseCliArgs(args: string[]): CliParseResult {
  if (args.length === 0 || args.includes("--help") || args.includes("-h")) {
    return { kind: "help" };
  }

  const fail = (message: string): CliParseResult => ({ kind: "error", message });
  const options: CliOptions = {
    jsonOutput: false,
    sortBy: "name",
    passThreshold: 0.5,
    inputPaths: [],
  };
  let pathsOnly = false;

  for (let i = 0; i < args.length; i++) {
    const arg = args[i];
    if (pathsOnly) {
      options.inputPaths.push(resolve(arg));
      continue;
    }

    switch (arg) {
      case "--":
        pathsOnly = true;
        break;
      case "--json":
        options.jsonOutput = true;
        break;
      case "--format":
      case "-f": {
        const fmt = args[++i];
        if (fmt === "json") {
          options.jsonOutput = true;
        } else if (fmt !== "markdown") {
          return fail('Error: --format must be one of: "markdown", "json"');
        }
        break;
      }
      case "--sort":
      case "-s": {
        const key = args[++i];
        if (key === undefined || !SORT_KEYS.includes(key)) {
          return fail(
            'Error: --sort must be one of: "name", "score", "pass_rate"',
          );
        }
        options.sortBy = key;
        break;
      }
      case "--pass-threshold":
      case "-p": {
        // A missing value parses to NaN and is rejected below.
        const val = Number.parseFloat(args[++i]);
        if (Number.isNaN(val) || val < 0 || val > 1) {
          return fail("Error: --pass-threshold must be between 0 and 1");
        }
        options.passThreshold = val;
        break;
      }
      default:
        if (arg.startsWith("-")) {
          return fail(`Error: unknown option "${arg}"`);
        }
        options.inputPaths.push(resolve(arg));
    }
  }

  if (options.inputPaths.length === 0) {
    return fail("Error: no input files or directories specified.");
  }
  return { kind: "run", options };
}

/**
 * CLI entry point: parses `process.argv`, loads the records, and prints the
 * report as JSON or as text.
 *
 * Failures are written to stderr and signalled through `process.exitCode = 1`
 * followed by a normal return, rather than `process.exit()`, so that output
 * already written to a pipe is not cut off.
 */
function main(): void {
  const parsed = parseCliArgs(process.argv.slice(2));

  if (parsed.kind === "help") {
    console.log(USAGE);
    return;
  }
  if (parsed.kind === "error") {
    console.error(parsed.message);
    process.exitCode = 1;
    return;
  }

  const { jsonOutput, sortBy, passThreshold, inputPaths } = parsed.options;

  // Load all records
  let records: ResultRecord[];
  try {
    records = loadRecords(inputPaths);
  } catch (err: unknown) {
    const reason = err instanceof Error ? err.message : String(err);
    console.error(`Error: could not read input: ${reason}`);
    process.exitCode = 1;
    return;
  }

  if (records.length === 0) {
    console.error("Error: no valid result records found in input files.");
    process.exitCode = 1;
    return;
  }

  // Build report
  const report = buildReport(records, passThreshold, sortBy);

  if (jsonOutput) {
    console.log(JSON.stringify(report, null, 2));
  } else {
    printMarkdown(report);
  }
}

main();