Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
106 changes: 106 additions & 0 deletions examples/features/trial-output-consistency/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
# Trial Output Consistency Metric

Measures how consistent an agent's outputs are across repeated trials using pairwise cosine similarity.

## What It Measures

When an agent is run multiple times on the same input (trials), outputs may vary due to LLM non-determinism. This metric quantifies that variation:

- **Score 1.0** — all trial outputs are identical/semantically equivalent
- **Score ~0.8+** — high consistency (minor wording differences)
- **Score ~0.5** — moderate consistency (different phrasing, same topic)
- **Score <0.5** — low consistency (substantially different outputs)

## How It Works

1. Receives an array of trial outputs via `config.trialOutputs`
2. Computes a vector representation for each output (embedding or token-overlap)
3. Calculates pairwise cosine similarity for all output pairs
4. Returns the average as the consistency score

### Similarity Methods

| Method | When Used | Accuracy |
|--------|-----------|----------|
| **Embedding** | Target client available, `fallback` not set | High — captures semantic similarity |
| **Token-overlap** | No target, the embedding request or response parsing fails, or `fallback: token` | Moderate — bag-of-words cosine |

## Edge Cases

| Condition | Score | Reasoning |
|-----------|-------|-----------|
| 0 trials | 0 | Cannot compute — reported as miss |
| 1 trial | 1.0 | Perfect consistency by definition |
| 2+ trials | 0–1 | Average pairwise cosine similarity |
| Identical outputs | 1.0 | Maximum similarity |
| Empty strings | 0 | Zero vectors produce 0 similarity |

## Usage

### Eval YAML

```yaml
assert:
- name: trial-consistency
type: code_judge
command: ["bun", "run", "../judges/trial-consistency.ts"]
config:
trialOutputs:
- "Output from trial 1"
- "Output from trial 2"
- "Output from trial 3"
```

### Config Options

| Field | Type | Required | Description |
|-------|------|----------|-------------|
| `trialOutputs` | `string[]` | Yes | Array of outputs from repeated trials |
| `fallback` | `"token"` | No | Force token-overlap mode (skip embedding) |

### Running

```bash
# Run all tests (uses token-overlap fallback for demo)
bun agentv eval examples/features/trial-output-consistency/evals/dataset.eval.yaml --dry-run

# Run a specific test
bun agentv eval examples/features/trial-output-consistency/evals/dataset.eval.yaml --test-id high-consistency --dry-run
```

## Extending

### Custom Embedding Provider

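Replace `getEmbeddings()` in `judges/trial-consistency.ts` with your preferred embedding API. The judge expects vectors as `number[][]` (one vector per trial output, or `null` to trigger the token-overlap fallback) — any embedding dimension works, as long as every vector has the same dimension.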

### Integration with Trial Execution

In a production workflow, pipe actual trial outputs into the `trialOutputs` config array. Example with a wrapper script:

```typescript
import { execSync } from 'child_process';

// Run 5 trials and collect each command's stdout as a string.
// The `...` in the command is a placeholder — substitute your eval file and flags.
const outputs = Array.from({ length: 5 }, () =>
execSync('bun agentv eval ... --json').toString()
);

// Pass the collected strings to the consistency judge via its `trialOutputs` config field
const config = { trialOutputs: outputs };
```

### Threshold-Based Pass/Fail

Wrap the judge in an assertion that enforces a minimum consistency threshold:

```yaml
assert:
- name: trial-consistency
type: code_judge
command: ["bun", "run", "../judges/trial-consistency.ts"]
config:
trialOutputs: [...]
```

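The judge itself only reports the consistency score (scores below 0.5 are listed as a miss); it does not apply a configurable threshold. Check `score >= 0.8` in the results to enforce high consistency.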
99 changes: 99 additions & 0 deletions examples/features/trial-output-consistency/evals/dataset.eval.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
# Trial Output Consistency Evaluator
# Measures how consistent an agent's outputs are across repeated trials
# using pairwise cosine similarity (embedding-based or token-overlap fallback).
#
# Run:
# bun agentv eval examples/features/trial-output-consistency/evals/dataset.eval.yaml --dry-run

description: Trial output consistency via embedding similarity

execution:
target: default

tests:
# ── High consistency: semantically identical outputs ──────────────
- id: high-consistency
criteria: Agent produces consistent answers across trials.

input:
- role: user
content: What is the capital of France?

expected_output:
- role: assistant
content: The capital of France is Paris.

assert:
- name: trial-consistency
type: code_judge
command: ["bun", "run", "../judges/trial-consistency.ts"]
config:
fallback: token
trialOutputs:
- "The capital of France is Paris."
- "Paris is the capital of France."
- "France's capital city is Paris."

# ── Low consistency: divergent outputs ────────────────────────────
- id: low-consistency
criteria: Agent produces inconsistent answers across trials.

input:
- role: user
content: Write a creative tagline for a coffee shop.

expected_output:
- role: assistant
content: Any creative tagline.

assert:
- name: trial-consistency
type: code_judge
command: ["bun", "run", "../judges/trial-consistency.ts"]
config:
fallback: token
trialOutputs:
- "Wake up and smell the magic."
- "Brewed with love, served with a smile."
- "Where every cup tells a story."

# ── Edge case: single trial ──────────────────────────────────────
- id: single-trial
criteria: Single trial returns perfect consistency.

input:
- role: user
content: What is 2 + 2?

expected_output:
- role: assistant
content: "4"

assert:
- name: trial-consistency
type: code_judge
command: ["bun", "run", "../judges/trial-consistency.ts"]
config:
fallback: token
trialOutputs:
- "The answer is 4."

# ── Edge case: zero trials ──────────────────────────────────────
- id: zero-trials
criteria: Zero trials returns an error score.

input:
- role: user
content: Hello

expected_output:
- role: assistant
content: Hi

assert:
- name: trial-consistency
type: code_judge
command: ["bun", "run", "../judges/trial-consistency.ts"]
config:
fallback: token
trialOutputs: []
184 changes: 184 additions & 0 deletions examples/features/trial-output-consistency/judges/trial-consistency.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
#!/usr/bin/env bun
/**
* Trial Output Consistency Judge
*
* Computes consistency across repeated trial outputs using pairwise cosine similarity.
* Uses the Vercel AI SDK via AgentV's target client to obtain vectors — the target is
* prompted to return a 64-number JSON array per output — with a token-overlap cosine
* similarity fallback when the client is unavailable or a response cannot be parsed.
*
* Config:
* trialOutputs: string[] — array of outputs from repeated trials
* fallback?: "token" — force token-overlap mode (skip embeddings)
*
* Edge cases:
* 0 trials → score 0, miss reported
* 1 trial → score 1.0 (perfect consistency by definition)
* 2+ trials → average pairwise cosine similarity
*/
import { createTargetClient, defineCodeJudge, z } from '@agentv/eval';

// Schema for the judge's `config` object.
// - `trialOutputs` is a required array of strings (an empty array is valid here;
//   the judge handles the 0-trial case itself).
// - `fallback` is optional; the only accepted value is 'token'.
const ConfigSchema = z.object({
trialOutputs: z.array(z.string()),
fallback: z.enum(['token']).optional(),
});

// ── Token-overlap cosine similarity (fallback) ──────────────────────────

/**
 * Splits text into lowercase word tokens for bag-of-words comparison.
 *
 * Punctuation and symbols are deleted (not replaced by a space), so "France's"
 * becomes the single token "frances". Letters, combining marks and digits from
 * any script are kept, so non-ASCII text such as "café" or "東京" survives
 * instead of being stripped by an ASCII-only `\w` class.
 *
 * @param text - Raw text to tokenize.
 * @returns Tokens in order of appearance; empty for blank or punctuation-only input.
 */
function tokenize(text: string): string[] {
  return text
    .toLowerCase()
    // Keep Unicode letters/marks/digits, underscore and whitespace; drop the rest.
    .replace(/[^\p{L}\p{M}\p{N}_\s]/gu, '')
    .split(/\s+/)
    .filter(Boolean);
}

/**
 * Counts how many times each token occurs.
 *
 * @param tokens - Tokens to count; duplicates are expected.
 * @returns Map from token to its occurrence count, keyed in first-seen order.
 */
function termFrequency(tokens: string[]): Map<string, number> {
  return tokens.reduce((counts, token) => {
    const seenSoFar = counts.get(token) ?? 0;
    counts.set(token, seenSoFar + 1);
    return counts;
  }, new Map<string, number>());
}

/**
 * Cosine similarity between two numeric vectors.
 *
 * Vectors of different lengths are compared as if the shorter one were padded
 * with zeros. Without this, a shorter `b` produces NaN (undefined components)
 * and a longer `b` has its trailing components silently ignored.
 *
 * @param a - First vector.
 * @param b - Second vector.
 * @returns dot(a, b) / (|a| * |b|), or 0 when either vector has zero magnitude
 *   (including empty vectors).
 */
function cosineSimilarity(a: number[], b: number[]): number {
  const dims = Math.max(a.length, b.length);
  let dot = 0;
  let normA = 0;
  let normB = 0;
  for (let i = 0; i < dims; i++) {
    // Missing components count as 0 so mismatched lengths never yield NaN.
    const x = a[i] ?? 0;
    const y = b[i] ?? 0;
    dot += x * y;
    normA += x * x;
    normB += y * y;
  }
  const denom = Math.sqrt(normA) * Math.sqrt(normB);
  // A zero-magnitude vector has no direction; report no similarity.
  return denom === 0 ? 0 : dot / denom;
}

/**
 * Builds term-frequency vectors for a set of texts over their shared vocabulary.
 *
 * The vocabulary is the union of all tokens across `texts`, in first-seen
 * order. Each returned vector has one entry per vocabulary term, holding that
 * term's count in the corresponding text (0 when absent).
 *
 * @param texts - Texts to vectorize.
 * @returns One vector per text, all of the same length.
 */
function tokenVectors(texts: string[]): number[][] {
  const vocabulary = new Set<string>();
  const frequencies: Map<string, number>[] = [];

  for (const text of texts) {
    const words = tokenize(text);
    words.forEach((word) => vocabulary.add(word));
    frequencies.push(termFrequency(words));
  }

  const terms = Array.from(vocabulary);
  return frequencies.map((counts) => terms.map((term) => counts.get(term) ?? 0));
}

// ── Embedding via target client ─────────────────────────────────────────

/**
 * Asks the target client for one numeric vector per text.
 *
 * Each text is sent with a system prompt requesting a JSON array of 64 numbers;
 * the first `[`…last `]` span of each response is parsed as JSON.
 *
 * Returns null (so the caller can fall back to token-overlap) when:
 *  - no target client is available,
 *  - the batch call or JSON parsing throws,
 *  - a response contains no non-empty JSON array,
 *  - any parsed element is not a finite number,
 *  - vectors differ in length, or
 *  - the number of vectors does not match the number of texts.
 *
 * @param texts - Texts to embed.
 * @returns One vector per text, all of equal dimension, or null on any failure.
 */
async function getEmbeddings(texts: string[]): Promise<number[][] | null> {
  const target = createTargetClient();
  if (!target) return null;

  try {
    const requests = texts.map((text) => ({
      question: text,
      systemPrompt:
        'Return ONLY a JSON array of 64 floating-point numbers representing a semantic embedding of the user message. No explanation.',
    }));
    const responses = await target.invokeBatch(requests);
    const embeddings: number[][] = [];
    let dimension: number | undefined;
    for (const r of responses) {
      const raw = r.rawText ?? '';
      const match = raw.match(/\[[\s\S]*\]/);
      if (!match) return null;
      const parsed: unknown = JSON.parse(match[0]);
      if (!Array.isArray(parsed) || parsed.length === 0) return null;
      const vector = parsed.map(Number);
      // Model output is untrusted: reject NaN/Infinity instead of letting them
      // poison the similarity score.
      if (!vector.every((value) => Number.isFinite(value))) return null;
      // Cosine similarity is only meaningful between vectors of equal dimension.
      if (dimension === undefined) {
        dimension = vector.length;
      } else if (vector.length !== dimension) {
        return null;
      }
      embeddings.push(vector);
    }
    // Every text needs its own vector; otherwise pair counts would be wrong.
    if (embeddings.length !== texts.length) return null;
    return embeddings;
  } catch {
    // Best-effort: any failure here means "embeddings unavailable".
    return null;
  }
}

// ── Pairwise average similarity ─────────────────────────────────────────

/**
 * Mean cosine similarity over every unordered pair of vectors.
 *
 * @param vectors - Vectors to compare.
 * @returns The average over all n·(n−1)/2 pairs, or 1 when fewer than two
 *   vectors are given (nothing to disagree with).
 */
function averagePairwiseSimilarity(vectors: number[][]): number {
  if (vectors.length < 2) return 1;

  let total = 0;
  let pairs = 0;
  vectors.forEach((left, index) => {
    // Compare each vector only with those after it so each pair is counted once.
    for (const right of vectors.slice(index + 1)) {
      total += cosineSimilarity(left, right);
      pairs += 1;
    }
  });
  return total / pairs;
}

// ── Judge ───────────────────────────────────────────────────────────────

/**
 * Trial-consistency judge.
 *
 * Reads `trialOutputs` (and optional `fallback`) from `input.config` and scores
 * how similar the outputs are to one another:
 *  - invalid config → score 0 with a miss describing the problem
 *  - 0 outputs      → score 0 with a miss
 *  - 1 output       → score 1 (trivially consistent)
 *  - 2+ outputs     → average pairwise cosine similarity, clamped to [0, 1]
 *
 * Embedding vectors are used unless `fallback` is 'token' or embeddings are
 * unavailable. If the embedding-based score is not a finite number, the judge
 * recomputes it with token-overlap vectors rather than returning NaN.
 *
 * Scores >= 0.8 and >= 0.5 are reported as hits ("High"/"Moderate");
 * anything lower is reported as a miss ("Low").
 */
export default defineCodeJudge(async (input) => {
  const parsed = ConfigSchema.safeParse(input.config ?? {});
  if (!parsed.success) {
    return {
      score: 0,
      hits: [],
      misses: ['Invalid config: trialOutputs (string[]) is required'],
      reasoning: `Config validation failed: ${parsed.error.message}`,
    };
  }

  const { trialOutputs, fallback } = parsed.data;

  // Edge case: 0 trials
  if (trialOutputs.length === 0) {
    return {
      score: 0,
      hits: [],
      misses: ['No trial outputs provided (0 trials)'],
      reasoning: 'Cannot compute consistency with 0 trials.',
    };
  }

  // Edge case: 1 trial
  if (trialOutputs.length === 1) {
    return {
      score: 1,
      hits: ['Single trial — perfect consistency by definition'],
      misses: [],
      reasoning: 'Only one trial output; consistency is trivially 1.0.',
      details: { trialCount: 1, method: 'trivial' },
    };
  }

  // 2+ trials: compute pairwise similarity
  let vectors: number[][] | null = null;
  let method = 'token-overlap';

  if (fallback !== 'token') {
    vectors = await getEmbeddings(trialOutputs);
    if (vectors) method = 'embedding';
  }

  if (!vectors) {
    vectors = tokenVectors(trialOutputs);
    method = 'token-overlap';
  }

  let rawScore = averagePairwiseSimilarity(vectors);

  // Embeddings come from model output; if they produced a non-finite score,
  // recompute with token-overlap instead of reporting NaN.
  if (method === 'embedding' && !Number.isFinite(rawScore)) {
    rawScore = averagePairwiseSimilarity(tokenVectors(trialOutputs));
    method = 'token-overlap';
  }

  // Clamp once so the reported messages and the returned score always agree
  // (cosine similarity can be negative or exceed 1 by rounding error).
  const score = Math.max(0, Math.min(1, rawScore));
  const hits: string[] = [];
  const misses: string[] = [];

  if (score >= 0.8) {
    hits.push(`High consistency: ${score.toFixed(3)}`);
  } else if (score >= 0.5) {
    hits.push(`Moderate consistency: ${score.toFixed(3)}`);
  } else {
    misses.push(`Low consistency: ${score.toFixed(3)}`);
  }

  return {
    score,
    hits,
    misses,
    reasoning: `Computed ${method} pairwise cosine similarity across ${trialOutputs.length} trial outputs.`,
    details: {
      trialCount: trialOutputs.length,
      method,
      pairCount: (trialOutputs.length * (trialOutputs.length - 1)) / 2,
    },
  };
});