EntityProcess · christso · Feb 25, 2026 · Feb 25, 2026
diff --git a/examples/features/benchmark-tooling/README.md b/examples/features/benchmark-tooling/README.md
@@ -0,0 +1,54 @@
+# Benchmark Tooling
+
+Utilities for multi-model benchmarking workflows with AgentV.
+
+## split-by-target
+
+Splits a combined results JSONL file into one file per `target`, enabling pairwise comparison with `agentv compare`.
+
+### Usage
+
+```bash
+# Split into the same directory as the input file
+bun examples/features/benchmark-tooling/scripts/split-by-target.ts results.jsonl
+
+# Split into a specific output directory
+bun examples/features/benchmark-tooling/scripts/split-by-target.ts results.jsonl ./split-output
+```
+
+Given a combined `results.jsonl` containing records for targets `gpt-4.1` and `claude-sonnet-4`, the script writes one file per target:
+
+```
+results.gpt-4.1.jsonl          (records where target == "gpt-4.1")
+results.claude-sonnet-4.jsonl  (records where target == "claude-sonnet-4")
+```
+
+### Filename Normalization
+
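+Target names are normalized into safe filenames: trimmed, lowercased, each run of characters outside `a-z`, `0-9`, `.`, `_`, `-` replaced by a single hyphen, and a leading or trailing hyphen removed (an empty result or a missing `target` becomes `unknown`):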
+
+| Target value | Output filename |
+|---|---|
+| `gpt-4.1` | `results.gpt-4.1.jsonl` |
+| `Claude Sonnet 4` | `results.claude-sonnet-4.jsonl` |
+| `azure/gpt-4o` | `results.azure-gpt-4o.jsonl` |
+
+### Downstream Compare Workflow
+
+After splitting, use `agentv compare` to perform pairwise model comparisons:
+
+```bash
+# 1. Run a matrix evaluation that produces a combined results file
+bun agentv eval my-eval.yaml
+
+# 2. Split results by target
+bun examples/features/benchmark-tooling/scripts/split-by-target.ts results.jsonl ./by-target
+
+# 3. Compare any two targets
+bun agentv compare ./by-target/results.gpt-4.1.jsonl ./by-target/results.claude-sonnet-4.jsonl
+
+# 4. JSON output for CI pipelines
+bun agentv compare ./by-target/results.gpt-4.1.jsonl ./by-target/results.claude-sonnet-4.jsonl --json
+```
+
+The `compare` command matches records by `test_id`, calculates score deltas, and classifies each as win/loss/tie. It exits non-zero on regressions, making it suitable for CI gates.
diff --git a/examples/features/benchmark-tooling/scripts/split-by-target.ts b/examples/features/benchmark-tooling/scripts/split-by-target.ts
@@ -0,0 +1,86 @@
+#!/usr/bin/env bun
+/**
+ * split-by-target — Split a combined results JSONL file into one file per target.
+ *
+ * Usage:
+ *   bun examples/features/benchmark-tooling/scripts/split-by-target.ts <input.jsonl> [output-dir]
+ *
+ * Output directory defaults to the same directory as the input file.
+ */
+
+import { readFileSync, writeFileSync, mkdirSync, existsSync } from "node:fs";
+import { resolve, dirname, basename } from "node:path";
+
+/** Lowercases a target name and reduces it to a filename-safe slug made of [a-z0-9._-]. */
+function normalizeTargetName(target: string): string {
+  const lowered = target.trim().toLowerCase();
+  // Every run of characters outside the safe set collapses into exactly one hyphen.
+  const hyphenated = lowered.replace(/[^a-z0-9._-]+/g, "-").replace(/-+/g, "-");
+  // At most one hyphen can remain at either end after collapsing; drop it.
+  return hyphenated.replace(/^-|-$/g, "");
+}
+
+/**
+ * Splits the JSONL file at `inputPath` into one file per target inside `outputDir` (created if
+ * missing). Records are grouped by normalized target name, so targets that map to the same
+ * filename share one file rather than overwriting each other. Non-JSON lines are skipped with a
+ * warning; a missing or non-string `target` goes to "unknown". Exits 1 if no non-blank lines.
+ */
+function splitByTarget(inputPath: string, outputDir: string): void {
+  const content = readFileSync(inputPath, "utf-8");
+  const lines = content.split("\n").filter((line) => line.trim().length > 0);
+  if (lines.length === 0) {
+    console.error("Error: input file is empty or contains no valid lines.");
+    process.exit(1);
+  }
+  const groups = new Map<string, string[]>();
+  let parsedCount = 0; // skipped lines must not be reported as split records
+  for (const line of lines) {
+    let record: unknown;
+    try {
+      record = JSON.parse(line);
+    } catch {
+      console.error(`Warning: skipping non-JSON line: ${line.slice(0, 80)}`);
+      continue;
+    }
+    // JSON.parse may return null or a primitive, and `target` may hold any JSON value.
+    const rawTarget =
+      typeof record === "object" && record !== null && "target" in record ? record.target : undefined;
+    const safeName = (typeof rawTarget === "string" && normalizeTargetName(rawTarget)) || "unknown";
+    const bucket = groups.get(safeName) ?? [];
+    bucket.push(line);
+    groups.set(safeName, bucket);
+    parsedCount += 1;
+  }
+
+  mkdirSync(outputDir, { recursive: true }); // no-op when the directory already exists
+  const inputBase = basename(inputPath, ".jsonl");
+  for (const [safeName, records] of groups) {
+    const outFile = resolve(outputDir, `${inputBase}.${safeName}.jsonl`);
+    writeFileSync(outFile, records.join("\n") + "\n");
+    console.log(`  ${outFile} (${records.length} records)`);
+  }
+  console.log(`\nSplit ${parsedCount} records into ${groups.size} target file(s).`);
+}
+
+// --- CLI entry point ---
+
+const args = process.argv.slice(2);
+const wantsHelp = args.some((arg) => arg === "--help" || arg === "-h");
+if (wantsHelp || args.length === 0) {
+  console.log(
+    "Usage: bun split-by-target.ts <input.jsonl> [output-dir]\n\nSplits a combined results JSONL into one file per target.",
+  );
+  process.exit(0);
+}
+
+// Resolve the input path; the output directory defaults to the input file's directory.
+const inputPath = resolve(args[0]);
+const outputDir = args[1] ? resolve(args[1]) : dirname(inputPath);
+if (!existsSync(inputPath)) {
+  console.error(`Error: input file not found: ${inputPath}`);
+  process.exit(1);
+}
+
+console.log(`Splitting ${inputPath} by target → ${outputDir}\n`);
+splitByTarget(inputPath, outputDir);