diff --git a/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/__init__.py b/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/__init__.py index 225e8a8..deec275 100644 --- a/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/__init__.py +++ b/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/__init__.py @@ -24,6 +24,7 @@ def my_evaluator(input: EvalInput) -> EvalResult: from .types import ( EvalInput, EvalResult, + EvalStatus, IntermediateStepData, InvocationData, ToolCallData, @@ -34,6 +35,7 @@ def my_evaluator(input: EvalInput) -> EvalResult: "evaluator", "EvalInput", "EvalResult", + "EvalStatus", "IntermediateStepData", "InvocationData", "ToolCallData", diff --git a/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/types.py b/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/types.py index 101026d..51974b1 100644 --- a/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/types.py +++ b/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/types.py @@ -7,6 +7,7 @@ from __future__ import annotations +from enum import Enum from typing import Any, Optional from pydantic import BaseModel, Field @@ -63,13 +64,21 @@ class EvalInput(BaseModel): expected_invocations: Optional[list[InvocationData]] = None +class EvalStatus(str, Enum): + """Wire JSON uses the string values (stable protocol with agentevals CLI).""" + + PASSED = "PASSED" + FAILED = "FAILED" + NOT_EVALUATED = "NOT_EVALUATED" + + class EvalResult(BaseModel): """Output payload expected from a custom evaluator script/container on stdout.""" score: float = Field(ge=0.0, le=1.0) - status: Optional[str] = Field( + status: Optional[EvalStatus] = Field( default=None, - description='One of "PASSED", "FAILED", "NOT_EVALUATED". Derived from score vs threshold if omitted.', + description="One of EvalStatus.PASSED, EvalStatus.FAILED, EvalStatus.NOT_EVALUATED. Derived from score vs threshold if omitted.", ) per_invocation_scores: list[Optional[float]] = Field(default_factory=list) details: Optional[dict[str, Any]] = None diff --git a/src/agentevals/_protocol.py b/src/agentevals/_protocol.py index add1484..4508490 100644 --- a/src/agentevals/_protocol.py +++ b/src/agentevals/_protocol.py @@ -14,6 +14,7 @@ from __future__ import annotations +from enum import Enum from typing import Any, Optional from pydantic import BaseModel, Field @@ -62,13 +63,21 @@ class EvalInput(BaseModel): expected_invocations: Optional[list[InvocationData]] = None +class EvalStatus(str, Enum): + """Allowed ``status`` values on the evaluator JSON wire format (matches evaluator-sdk).""" + + PASSED = "PASSED" + FAILED = "FAILED" + NOT_EVALUATED = "NOT_EVALUATED" + + class EvalResult(BaseModel): """Output payload expected from a custom evaluator on stdout.""" score: float = Field(ge=0.0, le=1.0) - status: Optional[str] = Field( + status: Optional[EvalStatus] = Field( default=None, - description='One of "PASSED", "FAILED", "NOT_EVALUATED". Derived from score vs threshold if omitted.', + description="Derived from score vs threshold if omitted.", ) per_invocation_scores: list[Optional[float]] = Field(default_factory=list) details: Optional[dict[str, Any]] = None diff --git a/tests/test_protocol.py b/tests/test_protocol.py new file mode 100644 index 0000000..8c0f14e --- /dev/null +++ b/tests/test_protocol.py @@ -0,0 +1,25 @@ +"""Tests for the custom evaluator JSON protocol models.""" + +import pytest +from pydantic import ValidationError + +from agentevals._protocol import EvalResult, EvalStatus + + +def test_eval_result_accepts_valid_status_strings() -> None: + raw = '{"score":1.0,"status":"PASSED","per_invocation_scores":[1.0]}' + r = EvalResult.model_validate_json(raw) + assert r.status == EvalStatus.PASSED + assert r.score == 1.0 + + +def test_eval_result_rejects_invalid_status() -> None: + raw = '{"score":1.0,"status":"MAYBE","per_invocation_scores":[]}' + with pytest.raises(ValidationError): + EvalResult.model_validate_json(raw) + + +def test_eval_result_omitted_status_ok() -> None: + raw = '{"score":0.5,"per_invocation_scores":[]}' + r = EvalResult.model_validate_json(raw) + assert r.status is None