diff --git a/pyproject.toml b/pyproject.toml
index cd82fc4..76e83d4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -30,6 +30,9 @@ streaming = [
     "opentelemetry-sdk>=1.20.0",
     "websockets>=12.0",
 ]
+openai = [
+    "openai>=2.0",
+]
 
 [project.scripts]
 agentevals = "agentevals.cli:main"
diff --git a/src/agentevals/api/routes.py b/src/agentevals/api/routes.py
index 628f2c6..c128300 100644
--- a/src/agentevals/api/routes.py
+++ b/src/agentevals/api/routes.py
@@ -22,6 +22,7 @@
     CodeEvaluatorDef,
     CustomEvaluatorDef,
     EvalRunConfig,
+    OpenAIEvalDef,
 )
 from ..extraction import get_extractor
 from ..runner import RunResult, get_loader, load_eval_set, run_evaluation
@@ -58,6 +59,7 @@ def _camel_keys(obj: Any) -> Any:
 _TYPE_TO_MODEL = {
     "builtin": BuiltinMetricDef,
     "code": CodeEvaluatorDef,
+    "openai_eval": OpenAIEvalDef,
 }
 
 
diff --git a/src/agentevals/config.py b/src/agentevals/config.py
index c3f99d9..3278c19 100644
--- a/src/agentevals/config.py
+++ b/src/agentevals/config.py
@@ -53,8 +53,48 @@ class RemoteEvaluatorDef(BaseEvaluatorDef):
     ref: str = Field(description="Source-specific reference (e.g. path within the repo).")
 
 
+_VALID_SIMILARITY_METRICS = frozenset(
+    {
+        "fuzzy_match",
+        "bleu",
+        "gleu",
+        "meteor",
+        "cosine",
+        "rouge_1",
+        "rouge_2",
+        "rouge_3",
+        "rouge_4",
+        "rouge_5",
+        "rouge_l",
+    }
+)
+
+
+class OpenAIEvalDef(BaseModel):
+    """An evaluator that delegates grading to the OpenAI Evals API."""
+
+    type: Literal["openai_eval"] = "openai_eval"
+    name: str
+    threshold: float = 0.5
+    timeout: int = Field(default=120, description="Max seconds to wait for the OpenAI eval run to complete.")
+    grader: dict[str, Any] = Field(description="OpenAI grader config passed to testing_criteria.")
+
+    @field_validator("grader")
+    @classmethod
+    def _validate_grader(cls, v: dict[str, Any]) -> dict[str, Any]:
+        grader_type = v.get("type")
+        if grader_type != "text_similarity":
+            raise ValueError(f"Only 'text_similarity' grader type is currently supported, got '{grader_type}'")
+        metric = v.get("evaluation_metric")
+        if not metric:
+            raise ValueError("'evaluation_metric' is required for text_similarity grader")
+        if metric not in _VALID_SIMILARITY_METRICS:
+            raise ValueError(f"Unknown evaluation_metric '{metric}'. Valid: {sorted(_VALID_SIMILARITY_METRICS)}")
+        return v
+
+
 CustomEvaluatorDef = Annotated[
-    BuiltinMetricDef | CodeEvaluatorDef | RemoteEvaluatorDef,
+    BuiltinMetricDef | CodeEvaluatorDef | RemoteEvaluatorDef | OpenAIEvalDef,
     Field(discriminator="type"),
 ]
 
diff --git a/src/agentevals/custom_evaluators.py b/src/agentevals/custom_evaluators.py
index 785af73..47bee97 100644
--- a/src/agentevals/custom_evaluators.py
+++ b/src/agentevals/custom_evaluators.py
@@ -425,9 +425,14 @@ async def evaluate_custom_evaluator(
     """
     import inspect as _inspect
 
-    from .config import CodeEvaluatorDef, RemoteEvaluatorDef
+    from .config import CodeEvaluatorDef, OpenAIEvalDef, RemoteEvaluatorDef
     from .runner import MetricResult
 
+    if isinstance(evaluator_def, OpenAIEvalDef):
+        from .openai_eval_backend import evaluate_openai_eval
+
+        return await evaluate_openai_eval(evaluator_def, actual_invocations, expected_invocations)
+
     if isinstance(evaluator_def, RemoteEvaluatorDef):
         from .evaluator.resolver import get_default_resolver
 
diff --git a/src/agentevals/eval_config_loader.py b/src/agentevals/eval_config_loader.py
index 9bdc9f4..5ddc7cc 100644
--- a/src/agentevals/eval_config_loader.py
+++ b/src/agentevals/eval_config_loader.py
@@ -13,6 +13,7 @@
     CodeEvaluatorDef,
     CustomEvaluatorDef,
     EvalRunConfig,
+    OpenAIEvalDef,
     RemoteEvaluatorDef,
 )
 
@@ -22,6 +23,7 @@
     "builtin": BuiltinMetricDef,
     "code": CodeEvaluatorDef,
     "remote": RemoteEvaluatorDef,
+    "openai_eval": OpenAIEvalDef,
 }
 
 
@@ -42,7 +44,7 @@ def _parse_evaluator_entry(entry: dict[str, Any]) -> tuple[str | None, CustomEvaluatorDef]:
     evaluator_type = entry.get("type")
 
     if not evaluator_type:
-        raise ValueError(f"Evaluator entry '{name}' must have a 'type' field (builtin, code, or remote)")
+        raise ValueError(f"Evaluator entry '{name}' must have a 'type' field ({', '.join(_TYPE_TO_MODEL)})")
 
     if evaluator_type not in _TYPE_TO_MODEL:
         raise ValueError(
diff --git a/src/agentevals/openai_eval_backend.py b/src/agentevals/openai_eval_backend.py
new file mode 100644
index 0000000..a6e9c00
--- /dev/null
+++ b/src/agentevals/openai_eval_backend.py
@@ -0,0 +1,246 @@
+"""Backend that delegates grading to the OpenAI Evals API.
+
+Builds testing criteria from the evaluator config, submits invocation pairs
+as JSONL items, polls for completion, and maps per-item results back to a
+MetricResult.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import logging
+import os
+import time
+from typing import Any
+
+from google.adk.evaluation.eval_case import Invocation
+
+from .config import OpenAIEvalDef
+from .custom_evaluators import _content_to_text
+
+logger = logging.getLogger(__name__)
+
+_POLL_INTERVAL_SECONDS = 2
+
+_TEXT_PAIR_SCHEMA = {
+    "type": "object",
+    "properties": {
+        "actual_response": {"type": "string"},
+        "expected_response": {"type": "string"},
+    },
+    "required": ["actual_response", "expected_response"],
+}
+
+
+def _build_testing_criteria(evaluator_def: OpenAIEvalDef) -> dict[str, Any]:
+    """Build the OpenAI testing_criteria dict from the evaluator config.
+
+    Each grader type produces a different shape. Extend this function
+    when adding support for new OpenAI grader types.
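+
+    For a text_similarity grader with evaluation_metric "rouge_l" and the
+    default threshold, the returned dict looks like (values illustrative):
+
+        {
+            "type": "text_similarity",
+            "name": "<evaluator name>",
+            "input": "{{ item.actual_response }}",
+            "reference": "{{ item.expected_response }}",
+            "evaluation_metric": "rouge_l",
+            "pass_threshold": 0.5,
+        }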
+    """
+    grader = evaluator_def.grader
+    grader_type = grader["type"]
+
+    if grader_type == "text_similarity":
+        return {
+            "type": "text_similarity",
+            "name": evaluator_def.name,
+            "input": "{{ item.actual_response }}",
+            "reference": "{{ item.expected_response }}",
+            "evaluation_metric": grader["evaluation_metric"],
+            "pass_threshold": evaluator_def.threshold,
+        }
+
+    raise ValueError(f"Unsupported grader type: {grader_type}")
+
+
+def _build_jsonl_items(
+    actual_invocations: list[Invocation],
+    expected_invocations: list[Invocation],
+) -> list[dict[str, Any]]:
+    items = []
+    for i, actual_inv in enumerate(actual_invocations):
+        actual_text = _content_to_text(actual_inv.final_response)
+        if i < len(expected_invocations):
+            expected_text = _content_to_text(expected_invocations[i].final_response)
+        else:
+            expected_text = ""
+        items.append(
+            {
+                "item": {
+                    "actual_response": actual_text,
+                    "expected_response": expected_text,
+                }
+            }
+        )
+    return items
+
+
+def _get_openai_client():
+    try:
+        from openai import OpenAI
+    except ImportError as exc:
+        raise ImportError(
+            "The 'openai' package is required for openai_eval evaluators. "
+            "Install it with: pip install 'agentevals-cli[openai]'"
+        ) from exc
+    return OpenAI()
+
+
+def _extract_item_score(output_item: Any) -> float | None:
+    results = getattr(output_item, "results", None)
+    if not results:
+        return None
+    for r in results:
+        if getattr(r, "score", None) is not None:
+            return float(r.score)
+    return None
+
+
+async def evaluate_openai_eval(
+    evaluator_def: OpenAIEvalDef,
+    actual_invocations: list[Invocation],
+    expected_invocations: list[Invocation] | None,
+) -> Any:
+    """Run an evaluation via the OpenAI Evals API and return a MetricResult."""
+    from .runner import MetricResult
+
+    if not os.environ.get("OPENAI_API_KEY"):
+        return MetricResult(
+            metric_name=evaluator_def.name,
+            error="OPENAI_API_KEY environment variable is not set.",
+        )
+
+    if expected_invocations is None:
+        return MetricResult(
+            metric_name=evaluator_def.name,
+            error="OpenAI text_similarity grader requires expected invocations (golden eval set).",
+        )
+
+    items = _build_jsonl_items(actual_invocations, expected_invocations)
+    if not items:
+        return MetricResult(
+            metric_name=evaluator_def.name,
+            error="No invocations to evaluate.",
+        )
+
+    testing_criteria = _build_testing_criteria(evaluator_def)
+    eval_id = None
+
+    try:
+        client = await asyncio.to_thread(_get_openai_client)
+
+        eval_obj = await asyncio.to_thread(
+            client.evals.create,
+            name=f"agentevals-{evaluator_def.name}",
+            data_source_config={
+                "type": "custom",
+                "item_schema": _TEXT_PAIR_SCHEMA,
+                "include_sample_schema": False,
+            },
+            testing_criteria=[testing_criteria],
+        )
+        eval_id = eval_obj.id
+        logger.info("Created OpenAI eval %s for '%s'", eval_id, evaluator_def.name)
+
+        run = await asyncio.to_thread(
+            client.evals.runs.create,
+            eval_id=eval_id,
+            name=f"agentevals-run-{evaluator_def.name}",
+            data_source={
+                "type": "jsonl",
+                "source": {
+                    "type": "file_content",
+                    "content": items,
+                },
+            },
+        )
+        run_id = run.id
+        logger.info("Created OpenAI eval run %s", run_id)
+
+        run = await _poll_run(client, eval_id, run_id, evaluator_def)
+        if isinstance(run, MetricResult):
+            return run
+
+        return await _collect_results(client, eval_id, run_id, run, evaluator_def)
+
+    except ImportError:
+        raise
+    except Exception as exc:
+        logger.exception("OpenAI eval failed for '%s'", evaluator_def.name)
+        return MetricResult(
+            metric_name=evaluator_def.name,
+            error=f"OpenAI Evals API error: {exc}",
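+            # Surfacing the error on the MetricResult instead of re-raising
+            # mirrors the earlier error-return paths in this function.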
+        )
+    finally:
+        if eval_id:
+            try:
+                await asyncio.to_thread(client.evals.delete, eval_id)
+                logger.debug("Cleaned up OpenAI eval %s", eval_id)
+            except Exception:
+                logger.debug("Failed to clean up OpenAI eval %s", eval_id, exc_info=True)
+
+
+async def _poll_run(client: Any, eval_id: str, run_id: str, evaluator_def: OpenAIEvalDef) -> Any:
+    """Poll until the run completes. Returns the run object, or a MetricResult on error/timeout."""
+    from .runner import MetricResult
+
+    start_time = time.monotonic()
+    while True:
+        elapsed = time.monotonic() - start_time
+        if elapsed > evaluator_def.timeout:
+            return MetricResult(
+                metric_name=evaluator_def.name,
+                error=f"OpenAI eval run timed out after {evaluator_def.timeout}s.",
+            )
+
+        run = await asyncio.to_thread(client.evals.runs.retrieve, run_id, eval_id=eval_id)
+
+        if run.status == "completed":
+            return run
+        if run.status in ("failed", "canceled"):
+            return MetricResult(
+                metric_name=evaluator_def.name,
+                error=f"OpenAI eval run {run.status}: {getattr(run, 'error', 'unknown')}",
+            )
+
+        await asyncio.sleep(_POLL_INTERVAL_SECONDS)
+
+
+async def _collect_results(client: Any, eval_id: str, run_id: str, run: Any, evaluator_def: OpenAIEvalDef) -> Any:
+    """Extract scores from a completed run and return a MetricResult."""
+    from .runner import MetricResult
+
+    output_items_page = await asyncio.to_thread(client.evals.runs.output_items.list, run_id=run_id, eval_id=eval_id)
+    output_items = list(output_items_page.data) if output_items_page.data else []
+
+    per_invocation_scores: list[float | None] = [_extract_item_score(item) for item in output_items]
+
+    valid_scores = [s for s in per_invocation_scores if s is not None]
+    overall_score = sum(valid_scores) / len(valid_scores) if valid_scores else 0.0
+
+    result_counts = run.result_counts
+    passed = result_counts.passed if result_counts else 0
+    failed = result_counts.failed if result_counts else 0
+    total = result_counts.total if result_counts else 0
+    eval_status = "PASSED" if failed == 0 and total > 0 else "FAILED"
+
+    details: dict[str, Any] = {
+        "openai_eval_id": eval_id,
+        "openai_run_id": run_id,
+        "evaluation_metric": evaluator_def.grader.get("evaluation_metric"),
+        "result_counts": {"passed": passed, "failed": failed, "total": total},
+    }
+    per_criteria = getattr(run, "per_testing_criteria_results", None)
+    if per_criteria:
+        details["per_testing_criteria"] = [
+            {"name": c.testing_criteria, "passed": c.passed, "failed": c.failed} for c in per_criteria
+        ]
+
+    return MetricResult(
+        metric_name=evaluator_def.name,
+        score=overall_score,
+        eval_status=eval_status,
+        per_invocation_scores=per_invocation_scores,
+        details=details,
+    )
diff --git a/uv.lock b/uv.lock
index 71c1f2e..8e8ebf3 100644
--- a/uv.lock
+++ b/uv.lock
@@ -50,6 +50,9 @@ live = [
     { name = "httpx" },
     { name = "mcp" },
 ]
+openai = [
+    { name = "openai" },
+]
 streaming = [
     { name = "opentelemetry-sdk" },
     { name = "websockets" },
@@ -71,6 +74,7 @@ requires-dist = [
     { name = "httpx", specifier = ">=0.27.0" },
     { name = "httpx", marker = "extra == 'live'", specifier = ">=0.27.0" },
     { name = "mcp", marker = "extra == 'live'", specifier = ">=1.26.0" },
+    { name = "openai", marker = "extra == 'openai'", specifier = ">=2.0" },
     { name = "opentelemetry-proto", specifier = ">=1.36.0" },
     { name = "opentelemetry-sdk", marker = "extra == 'streaming'", specifier = ">=1.20.0" },
     { name = "python-dotenv", specifier = ">=1.0.0" },
@@ -80,7 +84,7 @@ requires-dist = [
     { name = "uvicorn", extras = ["standard"], specifier = ">=0.32.0" },
     { name = "websockets", marker = "extra == 'streaming'", specifier = ">=12.0" },
 ]
-provides-extras = ["live", "streaming"]
+provides-extras = ["live", "streaming", "openai"]
 
 [package.metadata.requires-dev]
 dev = [