EntityProcess · christso · Mar 19, 2026 · Mar 19, 2026 · Mar 19, 2026
diff --git a/README.md b/README.md
@@ -164,7 +164,7 @@ For large-scale evaluations, AgentV supports JSONL (JSON Lines) format as an alt
 Optional sidecar YAML metadata file (`dataset.eval.yaml` alongside `dataset.jsonl`):
 ```yaml
 description: Math evaluation dataset
-dataset: math-tests
+name: math-tests
 execution:
   target: azure-llm
 assertions:

diff --git a/apps/cli/README.md b/apps/cli/README.md
@@ -164,7 +164,7 @@ For large-scale evaluations, AgentV supports JSONL (JSON Lines) format as an alt
 Optional sidecar YAML metadata file (`dataset.eval.yaml` alongside `dataset.jsonl`):
 ```yaml
 description: Math evaluation dataset
-dataset: math-tests
+name: math-tests
 execution:
   target: azure-llm
 assertions:

diff --git a/apps/cli/src/commands/eval/junit-writer.ts b/apps/cli/src/commands/eval/junit-writer.ts
@@ -41,7 +41,7 @@ export class JunitWriter {
 
     const grouped = new Map<string, EvaluationResult[]>();
     for (const result of this.results) {
-      const suite = result.dataset ?? 'default';
+      const suite = result.eval_set ?? 'default';
       const existing = grouped.get(suite);
       if (existing) {
         existing.push(result);

diff --git a/apps/cli/src/commands/trace/show.ts b/apps/cli/src/commands/trace/show.ts
@@ -185,7 +185,7 @@ function formatResultDetail(result: RawResult, index: number, tree: boolean): st
   // Standard flat view
   const scoreColor = result.score >= 0.9 ? c.green : result.score >= 0.5 ? c.yellow : c.red;
   lines.push(
-    `${c.bold}${testId}${c.reset}  ${scoreColor}${formatScore(result.score)}${c.reset}${result.target ? `  ${c.dim}target: ${result.target}${c.reset}` : ''}${result.dataset ? `  ${c.dim}dataset: ${result.dataset}${c.reset}` : ''}`,
+    `${c.bold}${testId}${c.reset}  ${scoreColor}${formatScore(result.score)}${c.reset}${result.target ? `  ${c.dim}target: ${result.target}${c.reset}` : ''}${result.eval_set ? `  ${c.dim}eval-set: ${result.eval_set}${c.reset}` : ''}`,
   );
 
   if (result.error) {

diff --git a/apps/cli/src/commands/trace/stats.ts b/apps/cli/src/commands/trace/stats.ts
@@ -108,8 +108,8 @@ function groupResults(results: RawResult[], groupBy?: string): GroupedResults[]
       case 'target':
         key = result.target ?? 'unknown';
         break;
-      case 'dataset':
-        key = result.dataset ?? 'unknown';
+      case 'eval-set':
+        key = result.eval_set ?? 'unknown';
         break;
       case 'test-id':
         key = result.test_id ?? result.eval_id ?? 'unknown';
@@ -220,10 +220,10 @@ export const traceStatsCommand = command({
       description: 'Path to JSONL result file',
     }),
     groupBy: option({
-      type: optional(oneOf(['target', 'dataset', 'test-id'])),
+      type: optional(oneOf(['target', 'eval-set', 'test-id'])),
       long: 'group-by',
       short: 'g',
-      description: 'Group statistics by: target, dataset, or test-id',
+      description: 'Group statistics by: target, eval-set, or test-id',
     }),
     format: option({
       type: optional(oneOf(['table', 'json'])),

diff --git a/apps/cli/src/commands/trace/utils.ts b/apps/cli/src/commands/trace/utils.ts
@@ -42,7 +42,7 @@ export interface RawResult {
   timestamp?: string;
   test_id?: string;
   eval_id?: string;
-  dataset?: string;
+  eval_set?: string;
   conversation_id?: string;
   score: number;
   assertions?: { text: string; passed: boolean; evidence?: string }[];

diff --git a/apps/cli/test/commands/eval/output-writers.test.ts b/apps/cli/test/commands/eval/output-writers.test.ts
@@ -125,9 +125,9 @@ describe('JunitWriter', () => {
-  it('should group results by dataset as testsuites', async () => {
+  it('should group results by eval set as testsuites', async () => {
     const writer = await JunitWriter.open(testFilePath);
 
-    await writer.append(makeResult({ testId: 'a-1', dataset: 'suite-a', score: 1.0 }));
-    await writer.append(makeResult({ testId: 'a-2', dataset: 'suite-a', score: 0.8 }));
-    await writer.append(makeResult({ testId: 'b-1', dataset: 'suite-b', score: 0.5 }));
+    await writer.append(makeResult({ testId: 'a-1', eval_set: 'suite-a', score: 1.0 }));
+    await writer.append(makeResult({ testId: 'a-2', eval_set: 'suite-a', score: 0.8 }));
+    await writer.append(makeResult({ testId: 'b-1', eval_set: 'suite-b', score: 0.5 }));
     await writer.close();
 
     const xml = await readFile(testFilePath, 'utf8');

diff --git a/apps/cli/test/commands/results/export-e2e-providers.test.ts b/apps/cli/test/commands/results/export-e2e-providers.test.ts
@@ -23,7 +23,7 @@ import { exportResults } from '../../../src/commands/results/export.js';
 const CLAUDE_CLI_RESULT = {
   timestamp: '2026-03-18T10:00:00.000Z',
   test_id: 'test-claude-reasoning',
-  dataset: 'multi-provider',
+  eval_set: 'multi-provider',
   score: 1.0,
   assertions: [
     { text: 'Correct answer', passed: true, evidence: 'Matched expected output' },
@@ -60,7 +60,7 @@ const CLAUDE_CLI_RESULT = {
 const CODEX_RESULT = {
   timestamp: '2026-03-18T10:01:00.000Z',
   test_id: 'test-codex-edit',
-  dataset: 'multi-provider',
+  eval_set: 'multi-provider',
   score: 0.9,
   assertions: [
     { text: 'File edited correctly', passed: true },
@@ -96,7 +96,7 @@ const CODEX_RESULT = {
 const COPILOT_RESULT = {
   timestamp: '2026-03-18T10:02:00.000Z',
   test_id: 'test-copilot-complete',
-  dataset: 'multi-provider',
+  eval_set: 'multi-provider',
   score: 0.85,
   assertions: [
     { text: 'Code completion correct', passed: true },
@@ -125,7 +125,7 @@ const COPILOT_RESULT = {
 const PI_RESULT = {
   timestamp: '2026-03-18T10:03:00.000Z',
   test_id: 'test-pi-refactor',
-  dataset: 'multi-provider',
+  eval_set: 'multi-provider',
   score: 0.75,
   assertions: [
     { text: 'Refactored correctly', passed: true },
@@ -143,7 +143,7 @@ const PI_RESULT = {
 const LLM_AZURE_RESULT = {
   timestamp: '2026-03-18T10:04:00.000Z',
   test_id: 'test-llm-analysis',
-  dataset: 'multi-provider',
+  eval_set: 'multi-provider',
   score: 1.0,
   assertions: [{ text: 'Analysis correct', passed: true }],
   output: [{ role: 'assistant', content: 'The code has a race condition in the connection pool.' }],
@@ -166,7 +166,7 @@ const LLM_AZURE_RESULT = {
 const LLM_GPT_RESULT = {
   timestamp: '2026-03-18T10:05:00.000Z',
   test_id: 'test-llm-analysis',
-  dataset: 'multi-provider',
+  eval_set: 'multi-provider',
   score: 0.8,
   assertions: [{ text: 'Analysis correct', passed: true }],
   output: [{ role: 'assistant', content: 'There might be a concurrency issue.' }],
@@ -181,7 +181,7 @@ const LLM_GPT_RESULT = {
 const MINIMAL_RESULT = {
   timestamp: '2026-03-18T10:06:00.000Z',
   test_id: 'test-minimal',
-  dataset: 'multi-provider',
+  eval_set: 'multi-provider',
   score: 0.5,
   assertions: [{ text: 'Exists', passed: true }],
   output: [{ role: 'assistant', content: 'Response.' }],
@@ -193,7 +193,7 @@ const MINIMAL_RESULT = {
 const ERROR_RESULT = {
   timestamp: '2026-03-18T10:07:00.000Z',
   test_id: 'test-error-case',
-  dataset: 'multi-provider',
+  eval_set: 'multi-provider',
   score: 0,
   assertions: [],
   output: [],
@@ -623,7 +623,7 @@ describe('export e2e — multi-provider metrics verification', () => {
       const record = {
         timestamp: '2026-03-18T10:00:00.000Z',
         test_id: 'test-case-convert',
-        dataset: 'test',
+        eval_set: 'test',
         score: 1.0,
         assertions: [{ text: 'ok', passed: true }],
         output_text: 'ok',
@@ -655,7 +655,7 @@ describe('export e2e — multi-provider metrics verification', () => {
       const record = {
         timestamp: '2026-03-18T10:00:00.000Z',
         eval_id: 'legacy-test-id',
-        dataset: 'test',
+        eval_set: 'test',
         score: 1.0,
         assertions: [{ text: 'ok', passed: true }],
         output_text: 'ok',

diff --git a/apps/cli/test/commands/results/export.test.ts b/apps/cli/test/commands/results/export.test.ts
@@ -15,7 +15,7 @@ import { exportResults } from '../../../src/commands/results/export.js';
 const RESULT_FULL = {
   timestamp: '2026-03-18T10:00:01.000Z',
   test_id: 'test-greeting',
-  dataset: 'demo',
+  eval_set: 'demo',
   score: 1.0,
   assertions: [
     { text: 'Says hello', passed: true },
@@ -39,7 +39,7 @@ const RESULT_FULL = {
 const RESULT_PARTIAL = {
   timestamp: '2026-03-18T10:00:05.000Z',
   test_id: 'test-math',
-  dataset: 'demo',
+  eval_set: 'demo',
   score: 0.5,
   assertions: [
     { text: 'Correct formula', passed: true },
@@ -65,7 +65,7 @@ const RESULT_PARTIAL = {
 const RESULT_DIFFERENT_TARGET = {
   timestamp: '2026-03-18T10:00:10.000Z',
   test_id: 'test-greeting',
-  dataset: 'demo',
+  eval_set: 'demo',
   score: 0.75,
   assertions: [
     { text: 'Says hello', passed: true },
@@ -80,7 +80,7 @@ const RESULT_DIFFERENT_TARGET = {
 const RESULT_NO_TRACE = {
   timestamp: '2026-03-18T10:00:15.000Z',
   test_id: 'test-simple',
-  dataset: 'demo',
+  eval_set: 'demo',
   score: 1.0,
   assertions: [{ text: 'Correct', passed: true }],
   output: [{ role: 'assistant', content: 'Yes.' }],

diff --git a/apps/cli/test/commands/trace/trace.test.ts b/apps/cli/test/commands/trace/trace.test.ts
@@ -16,7 +16,7 @@ import {
 const RESULT_WITH_TRACE = JSON.stringify({
   timestamp: '2026-02-20T21:38:05.833Z',
   test_id: 'test-1',
-  dataset: 'demo',
+  eval_set: 'demo',
   score: 1,
   assertions: [{ text: 'criterion-1', passed: true }],
   target: 'default',
@@ -34,7 +34,7 @@ const RESULT_WITH_TRACE = JSON.stringify({
 const RESULT_WITHOUT_TRACE = JSON.stringify({
   timestamp: '2026-02-20T21:38:06.000Z',
   test_id: 'test-2',
-  dataset: 'demo',
+  eval_set: 'demo',
   score: 0.75,
   assertions: [
     { text: 'criterion-1', passed: true },
@@ -46,7 +46,7 @@ const RESULT_WITHOUT_TRACE = JSON.stringify({
 const RESULT_FAILING = JSON.stringify({
   timestamp: '2026-02-20T21:38:07.000Z',
   test_id: 'test-3',
-  dataset: 'demo',
+  eval_set: 'demo',
   score: 0,
   assertions: [
     { text: 'criterion-1', passed: false },

diff --git a/examples/features/assert/evals/dataset.eval.baseline.jsonl b/examples/features/assert/evals/dataset.eval.baseline.jsonl
@@ -1,4 +1,4 @@
-{"timestamp":"2026-02-20T21:38:00.970Z","test_id":"regex-check","dataset":"dataset","score":1,"target":"default","scores":[{"name":"regex-Good (morning|afternoon|evenin","type":"regex","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output matches pattern /Good (morning|afternoon|evening)/","passed":true}]}],"assertions":[{"text":"Output matches pattern /Good (morning|afternoon|evening)/","passed":true,"evidence":"regex-Good (morning|afternoon|evenin: Output matches pattern /Good (morning|afternoon|evening)/"}]}
-{"timestamp":"2026-02-20T21:38:00.975Z","test_id":"contains-check","dataset":"dataset","score":1,"target":"default","scores":[{"name":"contains-Hello","type":"contains","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output contains \"Hello\"","passed":true}]},{"name":"regex-[Hh]ello","type":"regex","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output matches pattern /[Hh]ello/","passed":true}]}],"assertions":[{"text":"Output contains \"Hello\"","passed":true,"evidence":"contains-Hello: Output contains \"Hello\" | regex-[Hh]ello: Output matches pattern /[Hh]ello/"},{"text":"Output matches pattern /[Hh]ello/","passed":true}]}
-{"timestamp":"2026-02-20T21:38:01.114Z","test_id":"json-response","dataset":"dataset","score":1,"target":"default","scores":[{"name":"is_json","type":"is-json","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output is valid JSON","passed":true}]},{"name":"contains-\"status\"","type":"contains","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output contains \"\"status\"\"","passed":true}]},{"name":"contains-\"ok\"","type":"contains","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output contains \"\"ok\"\"","passed":true}]}],"assertions":[{"text":"Output is valid JSON","passed":true,"evidence":"is_json: Output is valid JSON | contains-\"status\": Output contains \"\"status\"\" | contains-\"ok\": Output contains \"\"ok\"\""},{"text":"Output contains \"\"status\"\"","passed":true},{"text":"Output contains \"\"ok\"\"","passed":true}]}
-{"timestamp":"2026-02-20T21:38:01.516Z","test_id":"equals-check","dataset":"dataset","score":1,"target":"default","scores":[{"name":"equals-4","type":"equals","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output equals \"4\"","passed":true}]}],"assertions":[{"text":"Output equals \"4\"","passed":true,"evidence":"equals-4: Output equals \"4\""}]}
+{"timestamp":"2026-02-20T21:38:00.970Z","test_id":"regex-check","eval_set":"dataset","score":1,"target":"default","scores":[{"name":"regex-Good (morning|afternoon|evenin","type":"regex","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output matches pattern /Good (morning|afternoon|evening)/","passed":true}]}],"assertions":[{"text":"Output matches pattern /Good (morning|afternoon|evening)/","passed":true,"evidence":"regex-Good (morning|afternoon|evenin: Output matches pattern /Good (morning|afternoon|evening)/"}]}
+{"timestamp":"2026-02-20T21:38:00.975Z","test_id":"contains-check","eval_set":"dataset","score":1,"target":"default","scores":[{"name":"contains-Hello","type":"contains","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output contains \"Hello\"","passed":true}]},{"name":"regex-[Hh]ello","type":"regex","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output matches pattern /[Hh]ello/","passed":true}]}],"assertions":[{"text":"Output contains \"Hello\"","passed":true,"evidence":"contains-Hello: Output contains \"Hello\" | regex-[Hh]ello: Output matches pattern /[Hh]ello/"},{"text":"Output matches pattern /[Hh]ello/","passed":true}]}
+{"timestamp":"2026-02-20T21:38:01.114Z","test_id":"json-response","eval_set":"dataset","score":1,"target":"default","scores":[{"name":"is_json","type":"is-json","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output is valid JSON","passed":true}]},{"name":"contains-\"status\"","type":"contains","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output contains \"\"status\"\"","passed":true}]},{"name":"contains-\"ok\"","type":"contains","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output contains \"\"ok\"\"","passed":true}]}],"assertions":[{"text":"Output is valid JSON","passed":true,"evidence":"is_json: Output is valid JSON | contains-\"status\": Output contains \"\"status\"\" | contains-\"ok\": Output contains \"\"ok\"\""},{"text":"Output contains \"\"status\"\"","passed":true},{"text":"Output contains \"\"ok\"\"","passed":true}]}
+{"timestamp":"2026-02-20T21:38:01.516Z","test_id":"equals-check","eval_set":"dataset","score":1,"target":"default","scores":[{"name":"equals-4","type":"equals","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output equals \"4\"","passed":true}]}],"assertions":[{"text":"Output equals \"4\"","passed":true,"evidence":"equals-4: Output equals \"4\""}]}