diff --git a/examples/features/benchmark-tooling/README.md b/examples/features/benchmark-tooling/README.md new file mode 100644 index 000000000..69adf51d7 --- /dev/null +++ b/examples/features/benchmark-tooling/README.md @@ -0,0 +1,54 @@ +# Benchmark Tooling + +Utilities for multi-model benchmarking workflows with AgentV. + +## split-by-target + +Splits a combined results JSONL file into one file per `target`, enabling pairwise comparison with `agentv compare`. + +### Usage + +```bash +# Split into the same directory as the input file +bun examples/features/benchmark-tooling/scripts/split-by-target.ts results.jsonl + +# Split into a specific output directory +bun examples/features/benchmark-tooling/scripts/split-by-target.ts results.jsonl ./split-output +``` + +Given a combined `results.jsonl` containing records for targets `gpt-4.1` and `claude-sonnet-4`: + +``` +results.gpt-4.1.jsonl (records where target == "gpt-4.1") +results.claude-sonnet-4.jsonl (records where target == "claude-sonnet-4") +``` + +### Filename Normalization + +Target names are normalized for safe filenames: + +| Target value | Output filename | +|---|---| +| `gpt-4.1` | `results.gpt-4.1.jsonl` | +| `Claude Sonnet 4` | `results.claude-sonnet-4.jsonl` | +| `azure/gpt-4o` | `results.azure-gpt-4o.jsonl` | + +### Downstream Compare Workflow + +After splitting, use `agentv compare` to perform pairwise model comparisons: + +```bash +# 1. Run a matrix evaluation that produces a combined results file +bun agentv eval my-eval.yaml + +# 2. Split results by target +bun examples/features/benchmark-tooling/scripts/split-by-target.ts results.jsonl ./by-target + +# 3. Compare any two targets +bun agentv compare ./by-target/results.gpt-4.1.jsonl ./by-target/results.claude-sonnet-4.jsonl + +# 4. 
// ---------------------------------------------------------------------------
// NOTE(review): the source lines replaced here also carried the tail of the
// README.md hunk and the diff header for this script. They are preserved
// verbatim below so this rewrite does not drop them.
//
// JSON output for CI pipelines
// +bun agentv compare ./by-target/results.gpt-4.1.jsonl ./by-target/results.claude-sonnet-4.jsonl --json
// +```
// +
// +The `compare` command matches records by `test_id`, calculates score deltas, and classifies each as win/loss/tie. It exits non-zero on regressions, making it suitable for CI gates.
// diff --git a/examples/features/benchmark-tooling/scripts/split-by-target.ts b/examples/features/benchmark-tooling/scripts/split-by-target.ts
// new file mode 100644
// index 000000000..1a33f2918
// --- /dev/null
// +++ b/examples/features/benchmark-tooling/scripts/split-by-target.ts
// @@ -0,0 +1,86 @@
// #!/usr/bin/env bun
// ---------------------------------------------------------------------------

/**
 * split-by-target — Split a combined results JSONL file into one file per target.
 *
 * Usage:
 *   bun examples/features/benchmark-tooling/scripts/split-by-target.ts <results.jsonl> [output-dir]
 *
 * Output directory defaults to the same directory as the input file.
 */

import { readFileSync, writeFileSync, mkdirSync, existsSync } from "node:fs";
import { resolve, dirname, basename } from "node:path";

/** Bucket name used when a record has no usable `target` value. */
const FALLBACK_TARGET = "unknown";

/** Result of bucketing raw JSONL lines by their normalized target name. */
interface GroupingResult {
  /** Normalized target name → original lines, in input order. */
  groups: Map<string, string[]>;
  /** Number of lines that could not be parsed as JSON. */
  skipped: number;
}

/**
 * Turns a target name into a string that is safe to embed in a filename.
 *
 * @param target - Raw `target` value from a result record.
 * @returns The trimmed, lower-cased name with every run of characters outside
 *   `[a-z0-9._-]` replaced by a single hyphen and edge hyphens removed. May be
 *   empty (e.g. for `"///"`); callers are expected to supply a fallback.
 */
function normalizeTargetName(target: string): string {
  return target
    .trim()
    .toLowerCase()
    .replace(/[^a-z0-9._-]+/g, "-") // replace unsafe chars with hyphens
    .replace(/-+/g, "-") // collapse consecutive hyphens
    .replace(/^-|-$/g, ""); // strip leading/trailing hyphens
}

/**
 * Derives the filename-safe bucket key for one parsed JSONL record.
 *
 * `JSON.parse` can return any JSON value, so the record is validated instead
 * of being trusted to be an object with a string `target`.
 *
 * @param record - The value produced by `JSON.parse` for one line.
 * @returns The normalized target name, or `"unknown"` when the record is not
 *   an object, has no scalar `target`, or the target normalizes to nothing.
 */
function resolveTargetKey(record: unknown): string {
  if (typeof record !== "object" || record === null) {
    return FALLBACK_TARGET;
  }
  // Safe after the runtime check above: any non-null object can be indexed.
  const target = (record as Record<string, unknown>).target;
  if (typeof target === "string") {
    return normalizeTargetName(target) || FALLBACK_TARGET;
  }
  if (typeof target === "number" || typeof target === "boolean") {
    return normalizeTargetName(String(target)) || FALLBACK_TARGET;
  }
  return FALLBACK_TARGET;
}

/**
 * Buckets JSONL lines by normalized target name.
 *
 * Grouping uses the *normalized* name (the one that ends up in the filename)
 * so that targets differing only in case or punctuation — `"GPT-4"` vs
 * `"gpt-4"` — share one output file instead of the later one overwriting the
 * earlier one.
 *
 * @param lines - Non-blank lines of the input file.
 * @returns The groups plus a count of lines skipped because they were not
 *   valid JSON. A warning is written to stderr for each skipped line.
 */
function groupLinesByTarget(lines: readonly string[]): GroupingResult {
  const groups = new Map<string, string[]>();
  let skipped = 0;

  for (const line of lines) {
    let parsed: unknown;
    try {
      parsed = JSON.parse(line);
    } catch {
      console.error(`Warning: skipping non-JSON line: ${line.slice(0, 80)}`);
      skipped += 1;
      continue;
    }

    const key = resolveTargetKey(parsed);
    const bucket = groups.get(key);
    if (bucket) {
      bucket.push(line);
    } else {
      groups.set(key, [line]);
    }
  }

  return { groups, skipped };
}

/**
 * Reads a combined results JSONL file and writes one
 * `<input-basename>.<target>.jsonl` file per target into `outputDir`.
 *
 * @param inputPath - Path of the combined JSONL file.
 * @param outputDir - Directory for the per-target files; created if missing.
 * @throws Error when the input contains no parseable records, and propagates
 *   any filesystem error raised while reading or writing.
 */
function splitByTarget(inputPath: string, outputDir: string): void {
  const content = readFileSync(inputPath, "utf-8");
  // Accept both LF and CRLF so Windows-produced files do not leak "\r" into records.
  const lines = content.split(/\r?\n/).filter((line) => line.trim().length > 0);

  const { groups, skipped } = groupLinesByTarget(lines);
  if (groups.size === 0) {
    throw new Error("input file is empty or contains no valid lines.");
  }

  // `recursive: true` is a no-op for an existing directory, so no existence check is needed.
  mkdirSync(outputDir, { recursive: true });

  const inputBase = basename(inputPath, ".jsonl");

  for (const [safeName, records] of groups) {
    const outFile = resolve(outputDir, `${inputBase}.${safeName}.jsonl`);
    writeFileSync(outFile, records.join("\n") + "\n");
    console.log(` ${outFile} (${records.length} records)`);
  }

  // Report only the records actually written, not the skipped lines.
  const written = lines.length - skipped;
  console.log(`\nSplit ${written} records into ${groups.size} target file(s).`);
}

/**
 * CLI driver.
 *
 * @param argv - Command-line arguments without the runtime and script path.
 * @returns The process exit code: 0 on success or when usage is printed,
 *   1 when the input is missing or splitting fails.
 */
function main(argv: readonly string[]): number {
  const [inputArg, outputArg] = argv;

  if (inputArg === undefined || argv.includes("--help") || argv.includes("-h")) {
    console.log(
      "Usage: bun split-by-target.ts <results.jsonl> [output-dir]\n\nSplits a combined results JSONL into one file per target.",
    );
    return 0;
  }

  const inputPath = resolve(inputArg);
  const outputDir = outputArg ? resolve(outputArg) : dirname(inputPath);

  if (!existsSync(inputPath)) {
    console.error(`Error: input file not found: ${inputPath}`);
    return 1;
  }

  console.log(`Splitting ${inputPath} by target → ${outputDir}\n`);
  try {
    splitByTarget(inputPath, outputDir);
  } catch (err: unknown) {
    const message = err instanceof Error ? err.message : String(err);
    console.error(`Error: ${message}`);
    return 1;
  }
  return 0;
}

// --- CLI entry point ---

// Set the exit code instead of calling process.exit() so pending stdout/stderr
// writes are flushed before the process ends.
const exitCode = main(process.argv.slice(2));
if (exitCode !== 0) {
  process.exitCode = exitCode;
}