Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ For large-scale evaluations, AgentV supports JSONL (JSON Lines) format as an alt
Optional sidecar YAML metadata file (`dataset.eval.yaml` alongside `dataset.jsonl`):
```yaml
description: Math evaluation dataset
dataset: math-tests
name: math-tests
execution:
target: azure-llm
assertions:
Expand Down
2 changes: 1 addition & 1 deletion apps/cli/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ For large-scale evaluations, AgentV supports JSONL (JSON Lines) format as an alt
Optional sidecar YAML metadata file (`dataset.eval.yaml` alongside `dataset.jsonl`):
```yaml
description: Math evaluation dataset
dataset: math-tests
name: math-tests
execution:
target: azure-llm
assertions:
Expand Down
2 changes: 1 addition & 1 deletion apps/cli/src/commands/eval/junit-writer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ export class JunitWriter {

const grouped = new Map<string, EvaluationResult[]>();
for (const result of this.results) {
const suite = result.dataset ?? 'default';
const suite = result.eval_set ?? 'default';
const existing = grouped.get(suite);
if (existing) {
existing.push(result);
Expand Down
2 changes: 1 addition & 1 deletion apps/cli/src/commands/trace/show.ts
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ function formatResultDetail(result: RawResult, index: number, tree: boolean): st
// Standard flat view
const scoreColor = result.score >= 0.9 ? c.green : result.score >= 0.5 ? c.yellow : c.red;
lines.push(
`${c.bold}${testId}${c.reset} ${scoreColor}${formatScore(result.score)}${c.reset}${result.target ? ` ${c.dim}target: ${result.target}${c.reset}` : ''}${result.dataset ? ` ${c.dim}dataset: ${result.dataset}${c.reset}` : ''}`,
`${c.bold}${testId}${c.reset} ${scoreColor}${formatScore(result.score)}${c.reset}${result.target ? ` ${c.dim}target: ${result.target}${c.reset}` : ''}${result.eval_set ? ` ${c.dim}eval-set: ${result.eval_set}${c.reset}` : ''}`,
);

if (result.error) {
Expand Down
8 changes: 4 additions & 4 deletions apps/cli/src/commands/trace/stats.ts
Original file line number Diff line number Diff line change
Expand Up @@ -108,8 +108,8 @@ function groupResults(results: RawResult[], groupBy?: string): GroupedResults[]
case 'target':
key = result.target ?? 'unknown';
break;
case 'dataset':
key = result.dataset ?? 'unknown';
case 'eval-set':
key = result.eval_set ?? 'unknown';
break;
case 'test-id':
key = result.test_id ?? result.eval_id ?? 'unknown';
Expand Down Expand Up @@ -220,10 +220,10 @@ export const traceStatsCommand = command({
description: 'Path to JSONL result file',
}),
groupBy: option({
type: optional(oneOf(['target', 'dataset', 'test-id'])),
type: optional(oneOf(['target', 'eval-set', 'test-id'])),
long: 'group-by',
short: 'g',
description: 'Group statistics by: target, dataset, or test-id',
description: 'Group statistics by: target, eval-set, or test-id',
}),
format: option({
type: optional(oneOf(['table', 'json'])),
Expand Down
2 changes: 1 addition & 1 deletion apps/cli/src/commands/trace/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ export interface RawResult {
timestamp?: string;
test_id?: string;
eval_id?: string;
dataset?: string;
eval_set?: string;
conversation_id?: string;
score: number;
assertions?: { text: string; passed: boolean; evidence?: string }[];
Expand Down
6 changes: 3 additions & 3 deletions apps/cli/test/commands/eval/output-writers.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -125,9 +125,9 @@ describe('JunitWriter', () => {
it('should group results by dataset as testsuites', async () => {
const writer = await JunitWriter.open(testFilePath);

await writer.append(makeResult({ testId: 'a-1', dataset: 'suite-a', score: 1.0 }));
await writer.append(makeResult({ testId: 'a-2', dataset: 'suite-a', score: 0.8 }));
await writer.append(makeResult({ testId: 'b-1', dataset: 'suite-b', score: 0.5 }));
await writer.append(makeResult({ testId: 'a-1', eval_set: 'suite-a', score: 1.0 }));
await writer.append(makeResult({ testId: 'a-2', eval_set: 'suite-a', score: 0.8 }));
await writer.append(makeResult({ testId: 'b-1', eval_set: 'suite-b', score: 0.5 }));
await writer.close();

const xml = await readFile(testFilePath, 'utf8');
Expand Down
20 changes: 10 additions & 10 deletions apps/cli/test/commands/results/export-e2e-providers.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ import { exportResults } from '../../../src/commands/results/export.js';
const CLAUDE_CLI_RESULT = {
timestamp: '2026-03-18T10:00:00.000Z',
test_id: 'test-claude-reasoning',
dataset: 'multi-provider',
eval_set: 'multi-provider',
score: 1.0,
assertions: [
{ text: 'Correct answer', passed: true, evidence: 'Matched expected output' },
Expand Down Expand Up @@ -60,7 +60,7 @@ const CLAUDE_CLI_RESULT = {
const CODEX_RESULT = {
timestamp: '2026-03-18T10:01:00.000Z',
test_id: 'test-codex-edit',
dataset: 'multi-provider',
eval_set: 'multi-provider',
score: 0.9,
assertions: [
{ text: 'File edited correctly', passed: true },
Expand Down Expand Up @@ -96,7 +96,7 @@ const CODEX_RESULT = {
const COPILOT_RESULT = {
timestamp: '2026-03-18T10:02:00.000Z',
test_id: 'test-copilot-complete',
dataset: 'multi-provider',
eval_set: 'multi-provider',
score: 0.85,
assertions: [
{ text: 'Code completion correct', passed: true },
Expand Down Expand Up @@ -125,7 +125,7 @@ const COPILOT_RESULT = {
const PI_RESULT = {
timestamp: '2026-03-18T10:03:00.000Z',
test_id: 'test-pi-refactor',
dataset: 'multi-provider',
eval_set: 'multi-provider',
score: 0.75,
assertions: [
{ text: 'Refactored correctly', passed: true },
Expand All @@ -143,7 +143,7 @@ const PI_RESULT = {
const LLM_AZURE_RESULT = {
timestamp: '2026-03-18T10:04:00.000Z',
test_id: 'test-llm-analysis',
dataset: 'multi-provider',
eval_set: 'multi-provider',
score: 1.0,
assertions: [{ text: 'Analysis correct', passed: true }],
output: [{ role: 'assistant', content: 'The code has a race condition in the connection pool.' }],
Expand All @@ -166,7 +166,7 @@ const LLM_AZURE_RESULT = {
const LLM_GPT_RESULT = {
timestamp: '2026-03-18T10:05:00.000Z',
test_id: 'test-llm-analysis',
dataset: 'multi-provider',
eval_set: 'multi-provider',
score: 0.8,
assertions: [{ text: 'Analysis correct', passed: true }],
output: [{ role: 'assistant', content: 'There might be a concurrency issue.' }],
Expand All @@ -181,7 +181,7 @@ const LLM_GPT_RESULT = {
const MINIMAL_RESULT = {
timestamp: '2026-03-18T10:06:00.000Z',
test_id: 'test-minimal',
dataset: 'multi-provider',
eval_set: 'multi-provider',
score: 0.5,
assertions: [{ text: 'Exists', passed: true }],
output: [{ role: 'assistant', content: 'Response.' }],
Expand All @@ -193,7 +193,7 @@ const MINIMAL_RESULT = {
const ERROR_RESULT = {
timestamp: '2026-03-18T10:07:00.000Z',
test_id: 'test-error-case',
dataset: 'multi-provider',
eval_set: 'multi-provider',
score: 0,
assertions: [],
output: [],
Expand Down Expand Up @@ -623,7 +623,7 @@ describe('export e2e — multi-provider metrics verification', () => {
const record = {
timestamp: '2026-03-18T10:00:00.000Z',
test_id: 'test-case-convert',
dataset: 'test',
eval_set: 'test',
score: 1.0,
assertions: [{ text: 'ok', passed: true }],
output_text: 'ok',
Expand Down Expand Up @@ -655,7 +655,7 @@ describe('export e2e — multi-provider metrics verification', () => {
const record = {
timestamp: '2026-03-18T10:00:00.000Z',
eval_id: 'legacy-test-id',
dataset: 'test',
eval_set: 'test',
score: 1.0,
assertions: [{ text: 'ok', passed: true }],
output_text: 'ok',
Expand Down
8 changes: 4 additions & 4 deletions apps/cli/test/commands/results/export.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ import { exportResults } from '../../../src/commands/results/export.js';
const RESULT_FULL = {
timestamp: '2026-03-18T10:00:01.000Z',
test_id: 'test-greeting',
dataset: 'demo',
eval_set: 'demo',
score: 1.0,
assertions: [
{ text: 'Says hello', passed: true },
Expand All @@ -39,7 +39,7 @@ const RESULT_FULL = {
const RESULT_PARTIAL = {
timestamp: '2026-03-18T10:00:05.000Z',
test_id: 'test-math',
dataset: 'demo',
eval_set: 'demo',
score: 0.5,
assertions: [
{ text: 'Correct formula', passed: true },
Expand All @@ -65,7 +65,7 @@ const RESULT_PARTIAL = {
const RESULT_DIFFERENT_TARGET = {
timestamp: '2026-03-18T10:00:10.000Z',
test_id: 'test-greeting',
dataset: 'demo',
eval_set: 'demo',
score: 0.75,
assertions: [
{ text: 'Says hello', passed: true },
Expand All @@ -80,7 +80,7 @@ const RESULT_DIFFERENT_TARGET = {
const RESULT_NO_TRACE = {
timestamp: '2026-03-18T10:00:15.000Z',
test_id: 'test-simple',
dataset: 'demo',
eval_set: 'demo',
score: 1.0,
assertions: [{ text: 'Correct', passed: true }],
output: [{ role: 'assistant', content: 'Yes.' }],
Expand Down
6 changes: 3 additions & 3 deletions apps/cli/test/commands/trace/trace.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ import {
const RESULT_WITH_TRACE = JSON.stringify({
timestamp: '2026-02-20T21:38:05.833Z',
test_id: 'test-1',
dataset: 'demo',
eval_set: 'demo',
score: 1,
assertions: [{ text: 'criterion-1', passed: true }],
target: 'default',
Expand All @@ -34,7 +34,7 @@ const RESULT_WITH_TRACE = JSON.stringify({
const RESULT_WITHOUT_TRACE = JSON.stringify({
timestamp: '2026-02-20T21:38:06.000Z',
test_id: 'test-2',
dataset: 'demo',
eval_set: 'demo',
score: 0.75,
assertions: [
{ text: 'criterion-1', passed: true },
Expand All @@ -46,7 +46,7 @@ const RESULT_WITHOUT_TRACE = JSON.stringify({
const RESULT_FAILING = JSON.stringify({
timestamp: '2026-02-20T21:38:07.000Z',
test_id: 'test-3',
dataset: 'demo',
eval_set: 'demo',
score: 0,
assertions: [
{ text: 'criterion-1', passed: false },
Expand Down
8 changes: 4 additions & 4 deletions examples/features/assert/evals/dataset.eval.baseline.jsonl
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
{"timestamp":"2026-02-20T21:38:00.970Z","test_id":"regex-check","dataset":"dataset","score":1,"target":"default","scores":[{"name":"regex-Good (morning|afternoon|evenin","type":"regex","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output matches pattern /Good (morning|afternoon|evening)/","passed":true}]}],"assertions":[{"text":"Output matches pattern /Good (morning|afternoon|evening)/","passed":true,"evidence":"regex-Good (morning|afternoon|evenin: Output matches pattern /Good (morning|afternoon|evening)/"}]}
{"timestamp":"2026-02-20T21:38:00.975Z","test_id":"contains-check","dataset":"dataset","score":1,"target":"default","scores":[{"name":"contains-Hello","type":"contains","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output contains \"Hello\"","passed":true}]},{"name":"regex-[Hh]ello","type":"regex","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output matches pattern /[Hh]ello/","passed":true}]}],"assertions":[{"text":"Output contains \"Hello\"","passed":true,"evidence":"contains-Hello: Output contains \"Hello\" | regex-[Hh]ello: Output matches pattern /[Hh]ello/"},{"text":"Output matches pattern /[Hh]ello/","passed":true}]}
{"timestamp":"2026-02-20T21:38:01.114Z","test_id":"json-response","dataset":"dataset","score":1,"target":"default","scores":[{"name":"is_json","type":"is-json","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output is valid JSON","passed":true}]},{"name":"contains-\"status\"","type":"contains","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output contains \"\"status\"\"","passed":true}]},{"name":"contains-\"ok\"","type":"contains","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output contains \"\"ok\"\"","passed":true}]}],"assertions":[{"text":"Output is valid JSON","passed":true,"evidence":"is_json: Output is valid JSON | contains-\"status\": Output contains \"\"status\"\" | contains-\"ok\": Output contains \"\"ok\"\""},{"text":"Output contains \"\"status\"\"","passed":true},{"text":"Output contains \"\"ok\"\"","passed":true}]}
{"timestamp":"2026-02-20T21:38:01.516Z","test_id":"equals-check","dataset":"dataset","score":1,"target":"default","scores":[{"name":"equals-4","type":"equals","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output equals \"4\"","passed":true}]}],"assertions":[{"text":"Output equals \"4\"","passed":true,"evidence":"equals-4: Output equals \"4\""}]}
{"timestamp":"2026-02-20T21:38:00.970Z","test_id":"regex-check","eval_set":"dataset","score":1,"target":"default","scores":[{"name":"regex-Good (morning|afternoon|evenin","type":"regex","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output matches pattern /Good (morning|afternoon|evening)/","passed":true}]}],"assertions":[{"text":"Output matches pattern /Good (morning|afternoon|evening)/","passed":true,"evidence":"regex-Good (morning|afternoon|evenin: Output matches pattern /Good (morning|afternoon|evening)/"}]}
{"timestamp":"2026-02-20T21:38:00.975Z","test_id":"contains-check","eval_set":"dataset","score":1,"target":"default","scores":[{"name":"contains-Hello","type":"contains","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output contains \"Hello\"","passed":true}]},{"name":"regex-[Hh]ello","type":"regex","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output matches pattern /[Hh]ello/","passed":true}]}],"assertions":[{"text":"Output contains \"Hello\"","passed":true,"evidence":"contains-Hello: Output contains \"Hello\" | regex-[Hh]ello: Output matches pattern /[Hh]ello/"},{"text":"Output matches pattern /[Hh]ello/","passed":true}]}
{"timestamp":"2026-02-20T21:38:01.114Z","test_id":"json-response","eval_set":"dataset","score":1,"target":"default","scores":[{"name":"is_json","type":"is-json","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output is valid JSON","passed":true}]},{"name":"contains-\"status\"","type":"contains","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output contains \"\"status\"\"","passed":true}]},{"name":"contains-\"ok\"","type":"contains","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output contains \"\"ok\"\"","passed":true}]}],"assertions":[{"text":"Output is valid JSON","passed":true,"evidence":"is_json: Output is valid JSON | contains-\"status\": Output contains \"\"status\"\" | contains-\"ok\": Output contains \"\"ok\"\""},{"text":"Output contains \"\"status\"\"","passed":true},{"text":"Output contains \"\"ok\"\"","passed":true}]}
{"timestamp":"2026-02-20T21:38:01.516Z","test_id":"equals-check","eval_set":"dataset","score":1,"target":"default","scores":[{"name":"equals-4","type":"equals","score":1,"weight":1,"verdict":"pass","assertions":[{"text":"Output equals \"4\"","passed":true}]}],"assertions":[{"text":"Output equals \"4\"","passed":true,"evidence":"equals-4: Output equals \"4\""}]}
Loading