From 9b54354841c730ec78292ae76da18b4d294c2c1b Mon Sep 17 00:00:00 2001 From: Peter Jausovec Date: Sun, 22 Mar 2026 22:50:00 +0100 Subject: [PATCH 1/4] add EvalStatus enum Signed-off-by: Peter Jausovec --- .../src/agentevals_evaluator_sdk/__init__.py | 2 ++ .../src/agentevals_evaluator_sdk/types.py | 13 +++++++++++-- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/__init__.py b/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/__init__.py index 225e8a8..deec275 100644 --- a/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/__init__.py +++ b/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/__init__.py @@ -24,6 +24,7 @@ def my_evaluator(input: EvalInput) -> EvalResult: from .types import ( EvalInput, EvalResult, + EvalStatus, IntermediateStepData, InvocationData, ToolCallData, @@ -34,6 +35,7 @@ def my_evaluator(input: EvalInput) -> EvalResult: "evaluator", "EvalInput", "EvalResult", + "EvalStatus", "IntermediateStepData", "InvocationData", "ToolCallData", diff --git a/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/types.py b/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/types.py index 101026d..d136e46 100644 --- a/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/types.py +++ b/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/types.py @@ -7,6 +7,7 @@ from __future__ import annotations +from enum import Enum from typing import Any, Optional from pydantic import BaseModel, Field @@ -63,13 +64,21 @@ class EvalInput(BaseModel): expected_invocations: Optional[list[InvocationData]] = None +class EvalStatus(str, Enum): + """Wire JSON uses the string values (stable protocol with agentevals CLI).""" + + PASSED = "PASSED" + FAILED = "FAILED" + NOT_EVALUATED = "NOT_EVALUATED" + class EvalResult(BaseModel): """Output payload expected from a custom evaluator script/container on stdout.""" score: float = Field(ge=0.0, le=1.0) - status: Optional[str] = Field( + status: 
Optional[EvalStatus] = Field( default=None, - description='One of "PASSED", "FAILED", "NOT_EVALUATED". Derived from score vs threshold if omitted.', + description="One of EvalStatus.PASSED, EvalStatus.FAILED, EvalStatus.NOT_EVALUATED. Derived from score vs threshold if omitted.", + enum=EvalStatus, ) per_invocation_scores: list[Optional[float]] = Field(default_factory=list) details: Optional[dict[str, Any]] = None From 632039434cf27389fe72b7baaa42b3f16663a6e5 Mon Sep 17 00:00:00 2001 From: Peter Jausovec Date: Sun, 22 Mar 2026 22:52:14 +0100 Subject: [PATCH 2/4] apply ruff formatting (blank line before EvalResult) Signed-off-by: Peter Jausovec --- packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/types.py | 1 + 1 file changed, 1 insertion(+) diff --git a/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/types.py b/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/types.py index d136e46..dbd3a45 100644 --- a/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/types.py +++ b/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/types.py @@ -71,6 +71,7 @@ class EvalStatus(str, Enum): FAILED = "FAILED" NOT_EVALUATED = "NOT_EVALUATED" + class EvalResult(BaseModel): """Output payload expected from a custom evaluator script/container on stdout.""" From 14e459bb40bcbc22aa735d398dec92c0c05ead97 Mon Sep 17 00:00:00 2001 From: Peter Jausovec Date: Sun, 22 Mar 2026 23:19:14 +0100 Subject: [PATCH 3/4] update the protocol + add unit tests Signed-off-by: Peter Jausovec --- src/agentevals/_protocol.py | 13 +++++++++++-- tests/test_protocol.py | 25 +++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 2 deletions(-) create mode 100644 tests/test_protocol.py diff --git a/src/agentevals/_protocol.py b/src/agentevals/_protocol.py index add1484..4508490 100644 --- a/src/agentevals/_protocol.py +++ b/src/agentevals/_protocol.py @@ -14,6 +14,7 @@ from __future__ import annotations +from enum import Enum from typing import Any, Optional from pydantic import BaseModel, Field @@ -62,13 +63,21 @@ class 
EvalInput(BaseModel): expected_invocations: Optional[list[InvocationData]] = None +class EvalStatus(str, Enum): + """Allowed ``status`` values on the evaluator JSON wire format (matches evaluator-sdk).""" + + PASSED = "PASSED" + FAILED = "FAILED" + NOT_EVALUATED = "NOT_EVALUATED" + + class EvalResult(BaseModel): """Output payload expected from a custom evaluator on stdout.""" score: float = Field(ge=0.0, le=1.0) - status: Optional[str] = Field( + status: Optional[EvalStatus] = Field( default=None, - description='One of "PASSED", "FAILED", "NOT_EVALUATED". Derived from score vs threshold if omitted.', + description="Derived from score vs threshold if omitted.", ) per_invocation_scores: list[Optional[float]] = Field(default_factory=list) details: Optional[dict[str, Any]] = None diff --git a/tests/test_protocol.py b/tests/test_protocol.py new file mode 100644 index 0000000..8c0f14e --- /dev/null +++ b/tests/test_protocol.py @@ -0,0 +1,25 @@ +"""Tests for the custom evaluator JSON protocol models.""" + +import pytest +from pydantic import ValidationError + +from agentevals._protocol import EvalResult, EvalStatus + + +def test_eval_result_accepts_valid_status_strings() -> None: + raw = '{"score":1.0,"status":"PASSED","per_invocation_scores":[1.0]}' + r = EvalResult.model_validate_json(raw) + assert r.status == EvalStatus.PASSED + assert r.score == 1.0 + + +def test_eval_result_rejects_invalid_status() -> None: + raw = '{"score":1.0,"status":"MAYBE","per_invocation_scores":[]}' + with pytest.raises(ValidationError): + EvalResult.model_validate_json(raw) + + +def test_eval_result_omitted_status_ok() -> None: + raw = '{"score":0.5,"per_invocation_scores":[]}' + r = EvalResult.model_validate_json(raw) + assert r.status is None From 16028699cc44de0187e31afcf7e014cb54378b65 Mon Sep 17 00:00:00 2001 From: Peter Jausovec Date: Mon, 23 Mar 2026 10:39:12 +0100 Subject: [PATCH 4/4] remove 'enum' from the field def Signed-off-by: Peter Jausovec --- 
packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/types.py | 1 - 1 file changed, 1 deletion(-) diff --git a/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/types.py b/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/types.py index dbd3a45..51974b1 100644 --- a/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/types.py +++ b/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/types.py @@ -79,7 +79,6 @@ class EvalResult(BaseModel): status: Optional[EvalStatus] = Field( default=None, description="One of EvalStatus.PASSED, EvalStatus.FAILED, EvalStatus.NOT_EVALUATED. Derived from score vs threshold if omitted.", - enum=EvalStatus, ) per_invocation_scores: list[Optional[float]] = Field(default_factory=list) details: Optional[dict[str, Any]] = None