EntityProcess · christso · Feb 25, 2026 · Feb 25, 2026
diff --git a/examples/features/trial-output-consistency/README.md b/examples/features/trial-output-consistency/README.md
@@ -0,0 +1,106 @@
+# Trial Output Consistency Metric
+
+Measures how consistent an agent's outputs are across repeated trials using pairwise cosine similarity.
+
+## What It Measures
+
+When an agent is run multiple times on the same input (trials), outputs may vary due to LLM non-determinism. This metric quantifies that variation:
+
+- **Score 1.0** — all trial outputs are identical/semantically equivalent
+- **Score ~0.8+** — high consistency (minor wording differences)
+- **Score ~0.5** — moderate consistency (different phrasing, same topic)
+- **Score <0.5** — low consistency (substantially different outputs)
+
+## How It Works
+
+1. Receives an array of trial outputs via `config.trialOutputs`
+2. Computes a vector representation for each output (embedding or token-overlap)
+3. Calculates pairwise cosine similarity for all output pairs
+4. Returns the average as the consistency score
+
+### Similarity Methods
+
+| Method | When Used | Accuracy |
+|--------|-----------|----------|
+| **Embedding** | Target client available, `fallback` not set | High — captures semantic similarity |
+| **Token-overlap** | No target or `fallback: token` | Moderate — bag-of-words cosine |
+
+## Edge Cases
+
+| Condition | Score | Reasoning |
+|-----------|-------|-----------|
+| 0 trials | 0 | Cannot compute — reported as miss |
+| 1 trial | 1.0 | Perfect consistency by definition |
+| 2+ trials | 0–1 | Average pairwise cosine similarity |
+| Identical outputs | 1.0 | Maximum similarity |
+| Empty strings | 0 | In token-overlap mode, zero vectors produce 0 similarity — even when every output is empty |
+
+## Usage
+
+### Eval YAML
+
+```yaml
+assert:
+  - name: trial-consistency
+    type: code_judge
+    command: ["bun", "run", "../judges/trial-consistency.ts"]
+    config:
+      trialOutputs:
+        - "Output from trial 1"
+        - "Output from trial 2"
+        - "Output from trial 3"
+```
+
+### Config Options
+
+| Field | Type | Required | Description |
+|-------|------|----------|-------------|
+| `trialOutputs` | `string[]` | Yes | Array of outputs from repeated trials |
+| `fallback` | `"token"` | No | Force token-overlap mode (skip embedding) |
+
+### Running
+
+```bash
+# Run all tests (uses token-overlap fallback for demo)
+bun agentv eval examples/features/trial-output-consistency/evals/dataset.eval.yaml --dry-run
+
+# Run a specific test
+bun agentv eval examples/features/trial-output-consistency/evals/dataset.eval.yaml --test-id high-consistency --dry-run
+```
+
+## Extending
+
+### Custom Embedding Provider
+
+Replace `getEmbeddings()` in `judges/trial-consistency.ts` with your preferred embedding API. The judge expects vectors as `number[][]` — any embedding dimension works.
+
+### Integration with Trial Execution
+
+In a production workflow, pipe actual trial outputs into the `trialOutputs` config array. Example with a wrapper script:
+
+```typescript
+import { execSync } from 'child_process';
+
+// Run N trials and collect outputs
+const outputs = Array.from({ length: 5 }, () =>
+  execSync('bun agentv eval ... --json').toString()
+);
+
+// Pass to consistency judge via config
+const config = { trialOutputs: outputs };
+```
+
+### Threshold-Based Pass/Fail
+
+The judge reports a score but does not enforce a minimum itself. Configure it as an assertion, then apply your threshold to the reported score:
+
+```yaml
+assert:
+  - name: trial-consistency
+    type: code_judge
+    command: ["bun", "run", "../judges/trial-consistency.ts"]
+    config:
+      trialOutputs: [...]
+```
+
+Check `score >= 0.8` in the results to enforce high consistency.
diff --git a/examples/features/trial-output-consistency/evals/dataset.eval.yaml b/examples/features/trial-output-consistency/evals/dataset.eval.yaml
@@ -0,0 +1,99 @@
+# Trial Output Consistency Evaluator
+# Measures how consistent an agent's outputs are across repeated trials
+# using pairwise cosine similarity (embedding-based or token-overlap fallback).
+#
+# Run:
+#   bun agentv eval examples/features/trial-output-consistency/evals/dataset.eval.yaml --dry-run
+
+description: Trial output consistency via embedding similarity
+
+execution:
+  target: default
+
+tests:
+  # ── High consistency: semantically identical outputs ──────────────
+  - id: high-consistency
+    criteria: Agent produces consistent answers across trials.
+
+    input:
+      - role: user
+        content: What is the capital of France?
+
+    expected_output:
+      - role: assistant
+        content: The capital of France is Paris.
+
+    assert:
+      - name: trial-consistency
+        type: code_judge
+        command: ["bun", "run", "../judges/trial-consistency.ts"]
+        config:
+          fallback: token
+          trialOutputs:
+            - "The capital of France is Paris."
+            - "Paris is the capital of France."
+            - "France's capital city is Paris."
+
+  # ── Low consistency: divergent outputs ────────────────────────────
+  - id: low-consistency
+    criteria: Agent produces inconsistent answers across trials.
+
+    input:
+      - role: user
+        content: Write a creative tagline for a coffee shop.
+
+    expected_output:
+      - role: assistant
+        content: Any creative tagline.
+
+    assert:
+      - name: trial-consistency
+        type: code_judge
+        command: ["bun", "run", "../judges/trial-consistency.ts"]
+        config:
+          fallback: token
+          trialOutputs:
+            - "Wake up and smell the magic."
+            - "Brewed with love, served with a smile."
+            - "Where every cup tells a story."
+
+  # ── Edge case: single trial ──────────────────────────────────────
+  - id: single-trial
+    criteria: Single trial returns perfect consistency.
+
+    input:
+      - role: user
+        content: What is 2 + 2?
+
+    expected_output:
+      - role: assistant
+        content: "4"
+
+    assert:
+      - name: trial-consistency
+        type: code_judge
+        command: ["bun", "run", "../judges/trial-consistency.ts"]
+        config:
+          fallback: token
+          trialOutputs:
+            - "The answer is 4."
+
+  # ── Edge case: zero trials ──────────────────────────────────────
+  - id: zero-trials
+    criteria: Zero trials returns an error score.
+
+    input:
+      - role: user
+        content: Hello
+
+    expected_output:
+      - role: assistant
+        content: Hi
+
+    assert:
+      - name: trial-consistency
+        type: code_judge
+        command: ["bun", "run", "../judges/trial-consistency.ts"]
+        config:
+          fallback: token
+          trialOutputs: []
diff --git a/examples/features/trial-output-consistency/judges/trial-consistency.ts b/examples/features/trial-output-consistency/judges/trial-consistency.ts
@@ -0,0 +1,184 @@
+#!/usr/bin/env bun
+/**
+ * Trial Output Consistency Judge
+ *
+ * Computes consistency across repeated trial outputs using embedding similarity.
+ * Uses the Vercel AI SDK for embeddings via AgentV's target client, with a
+ * token-overlap cosine similarity fallback when embeddings are unavailable.
+ *
+ * Config:
+ *   trialOutputs: string[]  — array of outputs from repeated trials
+ *   fallback?: "token"      — force token-overlap mode (skip embeddings)
+ *
+ * Edge cases:
+ *   0 trials  → score 0, miss reported
+ *   1 trial   → score 1.0 (perfect consistency by definition)
+ *   2+ trials → average pairwise cosine similarity
+ */
+import { createTargetClient, defineCodeJudge, z } from '@agentv/eval';
+
+/**
+ * Schema for the judge's `config` payload.
+ * - `trialOutputs`: required array of strings (an empty array is accepted).
+ * - `fallback`: optional; the only accepted value is `'token'`.
+ */
+const ConfigSchema = z.object({
+  trialOutputs: z.array(z.string()),
+  fallback: z.enum(['token']).optional(),
+});
+
+// ── Token-overlap cosine similarity (fallback) ──────────────────────────
+
+/**
+ * Splits text into lowercase word tokens for bag-of-words comparison.
+ *
+ * Punctuation is removed (not replaced by a space), so "France's" becomes
+ * "frances". Letters, combining marks, digits and underscores from any script
+ * are kept; an ASCII-only `\w` class would strip accented and non-Latin
+ * characters and leave such text with few or no tokens.
+ *
+ * @param text - Raw output text.
+ * @returns Tokens in order of appearance; empty for blank input.
+ */
+function tokenize(text: string): string[] {
+  return text
+    .toLowerCase()
+    .replace(/[^\p{L}\p{M}\p{N}_\s]/gu, '')
+    .split(/\s+/)
+    .filter(Boolean);
+}
+
+/**
+ * Counts how many times each token occurs.
+ *
+ * @param tokens - Token list, duplicates allowed.
+ * @returns Map from token to occurrence count, keyed in first-seen order.
+ */
+function termFrequency(tokens: string[]): Map<string, number> {
+  return tokens.reduce((counts, token) => {
+    const seen = counts.get(token) ?? 0;
+    return counts.set(token, seen + 1);
+  }, new Map<string, number>());
+}
+
+/**
+ * Cosine similarity of two numeric vectors.
+ *
+ * Vectors of different lengths are compared as if the shorter one were
+ * zero-padded, so a dimension mismatch yields a finite value instead of NaN.
+ *
+ * @param a - First vector.
+ * @param b - Second vector.
+ * @returns dot(a, b) / (|a| * |b|), or 0 when either vector has zero
+ *   magnitude or the result is not a finite number.
+ */
+function cosineSimilarity(a: number[], b: number[]): number {
+  const length = Math.max(a.length, b.length);
+  let dot = 0;
+  let normA = 0;
+  let normB = 0;
+  for (let i = 0; i < length; i++) {
+    // Missing components count as 0 (implicit zero-padding).
+    const x = a[i] ?? 0;
+    const y = b[i] ?? 0;
+    dot += x * y;
+    normA += x * x;
+    normB += y * y;
+  }
+  const denom = Math.sqrt(normA) * Math.sqrt(normB);
+  if (denom === 0) return 0;
+  const similarity = dot / denom;
+  // NaN/Infinity components would otherwise poison every average built on this.
+  return Number.isFinite(similarity) ? similarity : 0;
+}
+
+/**
+ * Converts each text into a term-count vector over a vocabulary shared by all
+ * texts. Vocabulary order follows first appearance across the inputs, and
+ * every returned vector has one entry per vocabulary word.
+ *
+ * @param texts - Texts to vectorize.
+ * @returns One count vector per input text, in input order.
+ */
+function tokenVectors(texts: string[]): number[][] {
+  const counts = texts.map((text) => termFrequency(tokenize(text)));
+  const vocabulary = new Set<string>();
+  for (const perText of counts) {
+    for (const word of perText.keys()) vocabulary.add(word);
+  }
+  const columns = Array.from(vocabulary);
+  return counts.map((perText) => columns.map((word) => perText.get(word) ?? 0));
+}
+
+// ── Embedding via target client ─────────────────────────────────────────
+
+/**
+ * Asks the target client for one numeric vector per text.
+ *
+ * Each text is sent with a system prompt requesting a JSON array of 64
+ * floats; the span from the first `[` to the last `]` of the raw response is
+ * parsed as that array.
+ *
+ * @param texts - Trial outputs to embed.
+ * @returns One vector per text, all of the same dimension and containing only
+ *   finite numbers; or `null` when no target client is available, a response
+ *   cannot be parsed, the vectors are malformed or inconsistent, or the call
+ *   throws. `null` tells the caller to use another similarity method.
+ */
+async function getEmbeddings(texts: string[]): Promise<number[][] | null> {
+  const target = createTargetClient();
+  if (!target) return null;
+
+  try {
+    const requests = texts.map((text) => ({
+      question: text,
+      systemPrompt:
+        'Return ONLY a JSON array of 64 floating-point numbers representing a semantic embedding of the user message. No explanation.',
+    }));
+    const responses = await target.invokeBatch(requests);
+    const embeddings: number[][] = [];
+    let dimension: number | null = null;
+    for (const r of responses) {
+      const raw = r.rawText ?? '';
+      const match = raw.match(/\[[\s\S]*\]/);
+      if (!match) return null;
+      const parsed: unknown = JSON.parse(match[0]);
+      if (!Array.isArray(parsed) || parsed.length === 0) return null;
+      const vector = parsed.map(Number);
+      // Non-numeric entries become NaN and would turn every similarity into NaN.
+      if (!vector.every((value) => Number.isFinite(value))) return null;
+      // The model may not honor the requested length; mixed dimensions are not comparable.
+      if (dimension !== null && vector.length !== dimension) return null;
+      dimension = vector.length;
+      embeddings.push(vector);
+    }
+    // Require exactly one vector per input so pair counts match the trial count.
+    if (embeddings.length !== texts.length) return null;
+    return embeddings;
+  } catch {
+    // Best-effort: any client or parse failure means "no embeddings available".
+    return null;
+  }
+}
+
+// ── Pairwise average similarity ─────────────────────────────────────────
+
+/**
+ * Mean cosine similarity over every unordered pair of vectors.
+ *
+ * @param vectors - Vectors to compare.
+ * @returns The average over all pairs, or 1 when fewer than two vectors are
+ *   given (there are no pairs to disagree).
+ */
+function averagePairwiseSimilarity(vectors: number[][]): number {
+  if (vectors.length < 2) return 1;
+  // Pairs are visited as (i, j) with i < j, in ascending order of i then j.
+  const pairScores = vectors.flatMap((left, i) =>
+    vectors.slice(i + 1).map((right) => cosineSimilarity(left, right)),
+  );
+  const total = pairScores.reduce((acc, value) => acc + value, 0);
+  return total / pairScores.length;
+}
+
+// ── Judge ───────────────────────────────────────────────────────────────
+
+/**
+ * Code judge entry point: scores how consistent `config.trialOutputs` are.
+ *
+ * - Invalid config or 0 trials → score 0 with a miss.
+ * - 1 trial → score 1.
+ * - 2+ trials → average pairwise cosine similarity, clamped to [0, 1].
+ *   Embedding vectors are used unless `fallback: 'token'` is set, they are
+ *   unavailable, or they produce a non-finite score; token-overlap vectors
+ *   are used otherwise.
+ *
+ * The hit/miss message is built from the same clamped score that is returned:
+ * >= 0.8 is a "High" hit, >= 0.5 a "Moderate" hit, anything lower a miss.
+ */
+export default defineCodeJudge(async (input) => {
+  const parsed = ConfigSchema.safeParse(input.config ?? {});
+  if (!parsed.success) {
+    return {
+      score: 0,
+      hits: [],
+      misses: ['Invalid config: trialOutputs (string[]) is required'],
+      reasoning: `Config validation failed: ${parsed.error.message}`,
+    };
+  }
+
+  const { trialOutputs, fallback } = parsed.data;
+
+  // Edge case: 0 trials
+  if (trialOutputs.length === 0) {
+    return {
+      score: 0,
+      hits: [],
+      misses: ['No trial outputs provided (0 trials)'],
+      reasoning: 'Cannot compute consistency with 0 trials.',
+    };
+  }
+
+  // Edge case: 1 trial
+  if (trialOutputs.length === 1) {
+    return {
+      score: 1,
+      hits: ['Single trial — perfect consistency by definition'],
+      misses: [],
+      reasoning: 'Only one trial output; consistency is trivially 1.0.',
+      details: { trialCount: 1, method: 'trivial' },
+    };
+  }
+
+  // 2+ trials: compute pairwise similarity
+  let vectors: number[][] | null = null;
+  if (fallback !== 'token') {
+    vectors = await getEmbeddings(trialOutputs);
+  }
+
+  let method = vectors ? 'embedding' : 'token-overlap';
+  let rawScore = averagePairwiseSimilarity(vectors ?? tokenVectors(trialOutputs));
+
+  // Malformed embeddings (e.g. mismatched dimensions) can yield NaN; recompute
+  // with token overlap instead of reporting a NaN score.
+  if (method === 'embedding' && !Number.isFinite(rawScore)) {
+    method = 'token-overlap';
+    rawScore = averagePairwiseSimilarity(tokenVectors(trialOutputs));
+  }
+
+  // Cosine similarity may be negative for embeddings; the reported score is
+  // clamped to [0, 1]. Math.max/Math.min propagate NaN, hence the explicit guard.
+  const score = Number.isFinite(rawScore) ? Math.max(0, Math.min(1, rawScore)) : 0;
+  const hits: string[] = [];
+  const misses: string[] = [];
+
+  if (score >= 0.8) {
+    hits.push(`High consistency: ${score.toFixed(3)}`);
+  } else if (score >= 0.5) {
+    hits.push(`Moderate consistency: ${score.toFixed(3)}`);
+  } else {
+    misses.push(`Low consistency: ${score.toFixed(3)}`);
+  }
+
+  return {
+    score,
+    hits,
+    misses,
+    reasoning: `Computed ${method} pairwise cosine similarity across ${trialOutputs.length} trial outputs.`,
+    details: {
+      trialCount: trialOutputs.length,
+      method,
+      pairCount: (trialOutputs.length * (trialOutputs.length - 1)) / 2,
+    },
+  };
+});