3 changes: 3 additions & 0 deletions pyproject.toml
@@ -30,6 +30,9 @@ streaming = [
"opentelemetry-sdk>=1.20.0",
"websockets>=12.0",
]
openai = [
"openai>=2.0",
]

[project.scripts]
agentevals = "agentevals.cli:main"
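The new optional dependency group keeps the OpenAI SDK out of the default install; it is only needed when an openai_eval evaluator is configured. A user would opt in via the extra, e.g. (assuming the published distribution is named agentevals-cli, as the import-error hint in openai_eval_backend.py below suggests):

    pip install 'agentevals-cli[openai]'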
2 changes: 2 additions & 0 deletions src/agentevals/api/routes.py
@@ -22,6 +22,7 @@
CodeEvaluatorDef,
CustomEvaluatorDef,
EvalRunConfig,
OpenAIEvalDef,
)
from ..extraction import get_extractor
from ..runner import RunResult, get_loader, load_eval_set, run_evaluation
@@ -58,6 +59,7 @@ def _camel_keys(obj: Any) -> Any:
_TYPE_TO_MODEL = {
"builtin": BuiltinMetricDef,
"code": CodeEvaluatorDef,
"openai_eval": OpenAIEvalDef,
}


42 changes: 41 additions & 1 deletion src/agentevals/config.py
@@ -53,8 +53,48 @@ class RemoteEvaluatorDef(BaseEvaluatorDef):
ref: str = Field(description="Source-specific reference (e.g. path within the repo).")


_VALID_SIMILARITY_METRICS = frozenset(
{
"fuzzy_match",
"bleu",
"gleu",
"meteor",
"cosine",
"rouge_1",
"rouge_2",
"rouge_3",
"rouge_4",
"rouge_5",
"rouge_l",
}
)


class OpenAIEvalDef(BaseModel):
"""An evaluator that delegates grading to the OpenAI Evals API."""

type: Literal["openai_eval"] = "openai_eval"
name: str
threshold: float = 0.5
timeout: int = Field(default=120, description="Max seconds to wait for the OpenAI eval run to complete.")
grader: dict[str, Any] = Field(description="OpenAI grader config passed to testing_criteria.")

@field_validator("grader")
@classmethod
def _validate_grader(cls, v: dict[str, Any]) -> dict[str, Any]:
grader_type = v.get("type")
if grader_type != "text_similarity":
raise ValueError(f"Only 'text_similarity' grader type is currently supported, got '{grader_type}'")
metric = v.get("evaluation_metric")
if not metric:
raise ValueError("'evaluation_metric' is required for text_similarity grader")
if metric not in _VALID_SIMILARITY_METRICS:
raise ValueError(f"Unknown evaluation_metric '{metric}'. Valid: {sorted(_VALID_SIMILARITY_METRICS)}")
return v


CustomEvaluatorDef = Annotated[
BuiltinMetricDef | CodeEvaluatorDef | RemoteEvaluatorDef,
BuiltinMetricDef | CodeEvaluatorDef | RemoteEvaluatorDef | OpenAIEvalDef,
Field(discriminator="type"),
]

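As a quick illustration of the new model (a sketch, not part of the diff, assuming only the fields and validator defined above):

    from agentevals.config import OpenAIEvalDef

    # Accepted: a text_similarity grader with a metric from _VALID_SIMILARITY_METRICS;
    # threshold and timeout fall back to their defaults (0.5 and 120s).
    ok = OpenAIEvalDef(
        name="response_similarity",
        grader={"type": "text_similarity", "evaluation_metric": "rouge_l"},
    )

    # Rejected: the validator only accepts grader type "text_similarity", so any
    # other type (or a missing/unknown evaluation_metric) raises a pydantic
    # ValidationError wrapping the ValueError from _validate_grader.
    try:
        OpenAIEvalDef(
            name="llm_judge",
            grader={"type": "score_model", "evaluation_metric": "cosine"},
        )
    except Exception as exc:
        print(exc)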
7 changes: 6 additions & 1 deletion src/agentevals/custom_evaluators.py
@@ -425,9 +425,14 @@ async def evaluate_custom_evaluator(
"""
import inspect as _inspect

from .config import CodeEvaluatorDef, RemoteEvaluatorDef
from .config import CodeEvaluatorDef, OpenAIEvalDef, RemoteEvaluatorDef
from .runner import MetricResult

if isinstance(evaluator_def, OpenAIEvalDef):
from .openai_eval_backend import evaluate_openai_eval

return await evaluate_openai_eval(evaluator_def, actual_invocations, expected_invocations)

if isinstance(evaluator_def, RemoteEvaluatorDef):
from .evaluator.resolver import get_default_resolver

4 changes: 3 additions & 1 deletion src/agentevals/eval_config_loader.py
@@ -13,6 +13,7 @@
CodeEvaluatorDef,
CustomEvaluatorDef,
EvalRunConfig,
OpenAIEvalDef,
RemoteEvaluatorDef,
)

@@ -22,6 +23,7 @@
"builtin": BuiltinMetricDef,
"code": CodeEvaluatorDef,
"remote": RemoteEvaluatorDef,
"openai_eval": OpenAIEvalDef,
}


@@ -42,7 +44,7 @@ def _parse_evaluator_entry(entry: dict[str, Any]) -> tuple[str | None, CustomEva

evaluator_type = entry.get("type")
if not evaluator_type:
raise ValueError(f"Evaluator entry '{name}' must have a 'type' field (builtin, code, or remote)")
raise ValueError(f"Evaluator entry '{name}' must have a 'type' field ({', '.join(_TYPE_TO_MODEL)})")

if evaluator_type not in _TYPE_TO_MODEL:
raise ValueError(
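For reference, an evaluator entry of the new type in an eval config would look roughly like this (a sketch; only the OpenAIEvalDef fields are assumed, and the surrounding config format is whatever the loader already consumes):

    entry = {
        "type": "openai_eval",   # looked up in _TYPE_TO_MODEL -> OpenAIEvalDef
        "name": "answer_similarity",
        "threshold": 0.6,
        "timeout": 300,
        "grader": {
            "type": "text_similarity",
            "evaluation_metric": "fuzzy_match",
        },
    }
    # An entry with a missing or unregistered "type" now fails with the updated
    # error message, which lists every key registered in _TYPE_TO_MODEL.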
246 changes: 246 additions & 0 deletions src/agentevals/openai_eval_backend.py
@@ -0,0 +1,246 @@
"""OpenAI Evals API backend — delegates grading to the OpenAI Evals API.

Builds testing criteria from the evaluator config, submits invocation pairs
as JSONL items, polls for completion, and maps per-item results back to a
MetricResult.
"""

from __future__ import annotations

import asyncio
import logging
import os
import time
from typing import Any

from google.adk.evaluation.eval_case import Invocation

from .config import OpenAIEvalDef
from .custom_evaluators import _content_to_text

logger = logging.getLogger(__name__)

_POLL_INTERVAL_SECONDS = 2

_TEXT_PAIR_SCHEMA = {
"type": "object",
"properties": {
"actual_response": {"type": "string"},
"expected_response": {"type": "string"},
},
"required": ["actual_response", "expected_response"],
}


def _build_testing_criteria(evaluator_def: OpenAIEvalDef) -> dict[str, Any]:
"""Build the OpenAI testing_criteria dict from the evaluator config.

Each grader type produces a different shape. Extend this function
when adding support for new OpenAI grader types.
"""
grader = evaluator_def.grader
grader_type = grader["type"]

if grader_type == "text_similarity":
return {
"type": "text_similarity",
"name": evaluator_def.name,
"input": "{{ item.actual_response }}",
"reference": "{{ item.expected_response }}",
"evaluation_metric": grader["evaluation_metric"],
"pass_threshold": evaluator_def.threshold,
}

raise ValueError(f"Unsupported grader type: {grader_type}")


def _build_jsonl_items(
actual_invocations: list[Invocation],
expected_invocations: list[Invocation],
) -> list[dict[str, Any]]:
items = []
for i, actual_inv in enumerate(actual_invocations):
actual_text = _content_to_text(actual_inv.final_response)
if i < len(expected_invocations):
expected_text = _content_to_text(expected_invocations[i].final_response)
else:
expected_text = ""
items.append(
{
"item": {
"actual_response": actual_text,
"expected_response": expected_text,
}
}
)
return items


def _get_openai_client():
try:
from openai import OpenAI
except ImportError as exc:
raise ImportError(
"The 'openai' package is required for openai_eval evaluators. "
"Install it with: pip install 'agentevals-cli[openai]'"
) from exc
return OpenAI()


def _extract_item_score(output_item: Any) -> float | None:
results = getattr(output_item, "results", None)
if not results:
return None
for r in results:
if getattr(r, "score", None) is not None:
return float(r.score)
return None


async def evaluate_openai_eval(
evaluator_def: OpenAIEvalDef,
actual_invocations: list[Invocation],
expected_invocations: list[Invocation] | None,
) -> Any:
"""Run an evaluation via the OpenAI Evals API and return a MetricResult."""
from .runner import MetricResult

if not os.environ.get("OPENAI_API_KEY"):
return MetricResult(
metric_name=evaluator_def.name,
error="OPENAI_API_KEY environment variable is not set.",
)

if expected_invocations is None:
return MetricResult(
metric_name=evaluator_def.name,
error="OpenAI text_similarity grader requires expected invocations (golden eval set).",
)

items = _build_jsonl_items(actual_invocations, expected_invocations)
if not items:
return MetricResult(
metric_name=evaluator_def.name,
error="No invocations to evaluate.",
)

testing_criteria = _build_testing_criteria(evaluator_def)
eval_id = None

try:
client = await asyncio.to_thread(_get_openai_client)

eval_obj = await asyncio.to_thread(
client.evals.create,
name=f"agentevals-{evaluator_def.name}",
data_source_config={
"type": "custom",
"item_schema": _TEXT_PAIR_SCHEMA,
"include_sample_schema": False,
},
testing_criteria=[testing_criteria],
)
eval_id = eval_obj.id
logger.info("Created OpenAI eval %s for '%s'", eval_id, evaluator_def.name)

run = await asyncio.to_thread(
client.evals.runs.create,
eval_id=eval_id,
name=f"agentevals-run-{evaluator_def.name}",
data_source={
"type": "jsonl",
"source": {
"type": "file_content",
"content": items,
},
},
)
run_id = run.id
logger.info("Created OpenAI eval run %s", run_id)

run = await _poll_run(client, eval_id, run_id, evaluator_def)
if isinstance(run, MetricResult):
return run

return await _collect_results(client, eval_id, run_id, run, evaluator_def)

except ImportError:
raise
except Exception as exc:
logger.exception("OpenAI eval failed for '%s'", evaluator_def.name)
return MetricResult(
metric_name=evaluator_def.name,
error=f"OpenAI Evals API error: {exc}",
)
finally:
if eval_id:
try:
await asyncio.to_thread(client.evals.delete, eval_id)
logger.debug("Cleaned up OpenAI eval %s", eval_id)
except Exception:
logger.debug("Failed to clean up OpenAI eval %s", eval_id, exc_info=True)


async def _poll_run(client: Any, eval_id: str, run_id: str, evaluator_def: OpenAIEvalDef) -> Any:
"""Poll until the run completes. Returns the run object, or a MetricResult on error/timeout."""
from .runner import MetricResult

start_time = time.monotonic()
while True:
elapsed = time.monotonic() - start_time
if elapsed > evaluator_def.timeout:
return MetricResult(
metric_name=evaluator_def.name,
error=f"OpenAI eval run timed out after {evaluator_def.timeout}s.",
)

run = await asyncio.to_thread(client.evals.runs.retrieve, run_id, eval_id=eval_id)

if run.status == "completed":
return run
if run.status in ("failed", "canceled"):
return MetricResult(
metric_name=evaluator_def.name,
error=f"OpenAI eval run {run.status}: {getattr(run, 'error', 'unknown')}",
)

await asyncio.sleep(_POLL_INTERVAL_SECONDS)


async def _collect_results(client: Any, eval_id: str, run_id: str, run: Any, evaluator_def: OpenAIEvalDef) -> Any:
"""Extract scores from a completed run and return a MetricResult."""
from .runner import MetricResult

output_items_page = await asyncio.to_thread(client.evals.runs.output_items.list, run_id=run_id, eval_id=eval_id)
output_items = list(output_items_page.data) if output_items_page.data else []

per_invocation_scores: list[float | None] = [_extract_item_score(item) for item in output_items]

valid_scores = [s for s in per_invocation_scores if s is not None]
overall_score = sum(valid_scores) / len(valid_scores) if valid_scores else 0.0

result_counts = run.result_counts
passed = result_counts.passed if result_counts else 0
failed = result_counts.failed if result_counts else 0
total = result_counts.total if result_counts else 0
eval_status = "PASSED" if failed == 0 and total > 0 else "FAILED"

details: dict[str, Any] = {
"openai_eval_id": eval_id,
"openai_run_id": run_id,
"evaluation_metric": evaluator_def.grader.get("evaluation_metric"),
"result_counts": {"passed": passed, "failed": failed, "total": total},
}
per_criteria = getattr(run, "per_testing_criteria_results", None)
if per_criteria:
details["per_testing_criteria"] = [
{"name": c.testing_criteria, "passed": c.passed, "failed": c.failed} for c in per_criteria
]

return MetricResult(
metric_name=evaluator_def.name,
score=overall_score,
eval_status=eval_status,
per_invocation_scores=per_invocation_scores,
details=details,
)
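To make the request shapes concrete: for a hypothetical evaluator named response_similarity with a rouge_l grader and a threshold of 0.7, _build_testing_criteria and _build_jsonl_items would produce payloads like the following (the response texts are invented for illustration):

    testing_criteria = [{
        "type": "text_similarity",
        "name": "response_similarity",
        "input": "{{ item.actual_response }}",
        "reference": "{{ item.expected_response }}",
        "evaluation_metric": "rouge_l",
        "pass_threshold": 0.7,
    }]

    # One JSONL item per actual invocation; when there is no matching expected
    # invocation, the reference falls back to an empty string.
    items = [
        {"item": {"actual_response": "Paris is the capital of France.",
                  "expected_response": "The capital of France is Paris."}},
    ]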
6 changes: 5 additions & 1 deletion uv.lock

Some generated files are not rendered by default.