Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 54 additions & 0 deletions examples/features/benchmark-tooling/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# Benchmark Tooling

Utilities for multi-model benchmarking workflows with AgentV.

## split-by-target

Splits a combined results JSONL file into one file per `target`, enabling pairwise comparison with `agentv compare`.

### Usage

```bash
# Split into the same directory as the input file
bun examples/features/benchmark-tooling/scripts/split-by-target.ts results.jsonl

# Split into a specific output directory
bun examples/features/benchmark-tooling/scripts/split-by-target.ts results.jsonl ./split-output
```

Given a combined `results.jsonl` containing records for targets `gpt-4.1` and `claude-sonnet-4`, the script writes:

```
results.gpt-4.1.jsonl (records where target == "gpt-4.1")
results.claude-sonnet-4.jsonl (records where target == "claude-sonnet-4")
```

### Filename Normalization

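Target names are normalized into safe filenames: they are trimmed and lowercased, each run of characters other than `a-z`, `0-9`, `.`, `_`, and `-` is replaced with a single hyphen, and leading/trailing hyphens are removed. Records without a `target` field are written to the `unknown` file.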

| Target value | Output filename |
|---|---|
| `gpt-4.1` | `results.gpt-4.1.jsonl` |
| `Claude Sonnet 4` | `results.claude-sonnet-4.jsonl` |
| `azure/gpt-4o` | `results.azure-gpt-4o.jsonl` |

### Downstream Compare Workflow

After splitting, use `agentv compare` to perform pairwise model comparisons:

```bash
# 1. Run a matrix evaluation that produces a combined results file
bun agentv eval my-eval.yaml

# 2. Split results by target
bun examples/features/benchmark-tooling/scripts/split-by-target.ts results.jsonl ./by-target

# 3. Compare any two targets
bun agentv compare ./by-target/results.gpt-4.1.jsonl ./by-target/results.claude-sonnet-4.jsonl

# 4. JSON output for CI pipelines
bun agentv compare ./by-target/results.gpt-4.1.jsonl ./by-target/results.claude-sonnet-4.jsonl --json
```

The `compare` command matches records by `test_id`, calculates score deltas, and classifies each as win/loss/tie. It exits non-zero on regressions, making it suitable for CI gates.
86 changes: 86 additions & 0 deletions examples/features/benchmark-tooling/scripts/split-by-target.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
#!/usr/bin/env bun
/**
* split-by-target — Split a combined results JSONL file into one file per target.
*
* Usage:
* bun examples/features/benchmark-tooling/scripts/split-by-target.ts <input.jsonl> [output-dir]
*
* Output directory defaults to the same directory as the input file.
*/

import { readFileSync, writeFileSync, mkdirSync, existsSync } from "node:fs";
import { resolve, dirname, basename } from "node:path";

/**
 * Turn a target name into a lowercase slug that is safe to embed in a filename.
 *
 * The input is trimmed and lowercased; every run of characters outside
 * `a-z`, `0-9`, `.`, `_` and `-` becomes a hyphen; runs of hyphens are
 * squeezed to one; a hyphen at the very start or end is removed.
 *
 * @param target - Raw target value taken from a results record.
 * @returns The normalized name, which may be empty (e.g. for `"///"`).
 */
function normalizeTargetName(target: string): string {
  const lowered = target.trim().toLowerCase();
  // Each run of unsafe characters turns into one hyphen.
  const hyphenated = lowered.replace(/[^a-z0-9._-]+/g, "-");
  // A replaced run can sit next to hyphens that were already there.
  const squeezed = hyphenated.replace(/-+/g, "-");
  // After squeezing, at most one hyphen remains at each end.
  return squeezed.replace(/^-|-$/g, "");
}

/**
 * Split a combined results JSONL file into one JSONL file per target.
 *
 * Each non-blank line of `inputPath` is parsed as JSON and grouped by the
 * normalized form of its `target` field. Every group is written to
 * `<outputDir>/<input basename>.<normalized target>.jsonl`, keeping the
 * original line text and order.
 *
 * - Lines that are not valid JSON are skipped with a warning on stderr.
 * - Records whose `target` is missing, is not a string, or normalizes to an
 *   empty string are collected under `unknown`.
 * - Targets that differ only in ways normalization removes (letter case,
 *   unsafe characters) share one output file instead of overwriting each
 *   other's file.
 *
 * Exits the process with status 1 when the input has no non-blank lines.
 *
 * @param inputPath - Path of the combined JSONL file to read.
 * @param outputDir - Directory that receives the per-target files; created if missing.
 */
function splitByTarget(inputPath: string, outputDir: string): void {
  const content = readFileSync(inputPath, "utf-8");
  // Accept both LF and CRLF line endings so no stray "\r" ends up in the output.
  const lines = content.split(/\r?\n/).filter((line) => line.trim().length > 0);

  if (lines.length === 0) {
    console.error("Error: input file is empty or contains no valid lines.");
    process.exit(1);
  }

  // Keyed by the normalized (filename-safe) target so that every output file
  // is written exactly once, even when several raw targets map to one name.
  const groups = new Map<string, string[]>();
  let recordCount = 0;

  for (const line of lines) {
    let parsed: unknown;
    try {
      parsed = JSON.parse(line);
    } catch {
      console.error(`Warning: skipping non-JSON line: ${line.slice(0, 80)}`);
      continue;
    }

    // JSON.parse may yield null or a primitive; only objects can carry `target`.
    const rawTarget =
      typeof parsed === "object" && parsed !== null
        ? (parsed as { target?: unknown }).target
        : undefined;
    const target = typeof rawTarget === "string" ? rawTarget : "unknown";
    const safeName = normalizeTargetName(target) || "unknown";

    const bucket = groups.get(safeName);
    if (bucket) {
      bucket.push(line);
    } else {
      groups.set(safeName, [line]);
    }
    recordCount += 1;
  }

  // `recursive: true` makes this a no-op when the directory already exists.
  mkdirSync(outputDir, { recursive: true });

  const inputBase = basename(inputPath, ".jsonl");

  for (const [safeName, records] of groups) {
    const outFile = resolve(outputDir, `${inputBase}.${safeName}.jsonl`);
    writeFileSync(outFile, records.join("\n") + "\n");
    console.log(` ${outFile} (${records.length} records)`);
  }

  // Report only the records actually written, not the skipped lines.
  console.log(`\nSplit ${recordCount} records into ${groups.size} target file(s).`);
}

// --- CLI entry point ---
//
// Usage: bun split-by-target.ts <input.jsonl> [output-dir]
// Exit status: 0 on success or an explicit help request; 1 when the required
// input argument is missing or the input file does not exist.

const args = process.argv.slice(2);
const usage =
  "Usage: bun split-by-target.ts <input.jsonl> [output-dir]\n\nSplits a combined results JSONL into one file per target.";

// An explicit help request is not an error: usage goes to stdout, status 0.
if (args.includes("--help") || args.includes("-h")) {
  console.log(usage);
  process.exit(0);
}

// A missing required argument is a usage error: report it on stderr and exit
// non-zero so calling scripts and CI steps notice the misconfiguration.
const inputArg = args[0];
if (inputArg === undefined) {
  console.error(usage);
  process.exit(1);
}

const inputPath = resolve(inputArg);
// Without an explicit output directory, write next to the input file.
const outputDir = args[1] ? resolve(args[1]) : dirname(inputPath);

if (!existsSync(inputPath)) {
  console.error(`Error: input file not found: ${inputPath}`);
  process.exit(1);
}

console.log(`Splitting ${inputPath} by target → ${outputDir}\n`);
splitByTarget(inputPath, outputDir);