@@ -24,6 +24,7 @@ def my_evaluator(input: EvalInput) -> EvalResult:
 from .types import (
     EvalInput,
     EvalResult,
+    EvalStatus,
     IntermediateStepData,
     InvocationData,
     ToolCallData,
@@ -34,6 +35,7 @@ def my_evaluator(input: EvalInput) -> EvalResult:
     "evaluator",
     "EvalInput",
     "EvalResult",
+    "EvalStatus",
     "IntermediateStepData",
     "InvocationData",
     "ToolCallData",
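For context, a minimal sketch of how a custom evaluator might use the newly re-exported `EvalStatus` (the hunks above appear to be the SDK package `__init__.py`, though the filename is cut off in this view; the `@evaluator` decorator usage and the 0.5 cutoff are assumptions for illustration, not taken from this PR):

# Sketch only: assumes @evaluator registers the function with the agentevals
# CLI runner; the exact decorator contract is not shown in this diff.
from agentevals_evaluator_sdk import EvalInput, EvalResult, EvalStatus, evaluator

@evaluator
def my_evaluator(input: EvalInput) -> EvalResult:
    score = 1.0  # placeholder scoring logic
    return EvalResult(
        score=score,
        status=EvalStatus.PASSED if score >= 0.5 else EvalStatus.FAILED,
    )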
packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/types.py (11 additions, 2 deletions)

@@ -7,6 +7,7 @@
 
 from __future__ import annotations
 
+from enum import Enum
 from typing import Any, Optional
 
 from pydantic import BaseModel, Field
@@ -63,13 +64,21 @@ class EvalInput(BaseModel):
     expected_invocations: Optional[list[InvocationData]] = None
 
 
+class EvalStatus(str, Enum):
+    """Wire JSON uses the string values (stable protocol with agentevals CLI)."""
+
+    PASSED = "PASSED"
+    FAILED = "FAILED"
+    NOT_EVALUATED = "NOT_EVALUATED"
+
+
 class EvalResult(BaseModel):
     """Output payload expected from a custom evaluator script/container on stdout."""
 
     score: float = Field(ge=0.0, le=1.0)
-    status: Optional[str] = Field(
+    status: Optional[EvalStatus] = Field(
         default=None,
-        description='One of "PASSED", "FAILED", "NOT_EVALUATED". Derived from score vs threshold if omitted.',
+        description="One of EvalStatus.PASSED, EvalStatus.FAILED, EvalStatus.NOT_EVALUATED. Derived from score vs threshold if omitted.",
     )
     per_invocation_scores: list[Optional[float]] = Field(default_factory=list)
     details: Optional[dict[str, Any]] = None
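Because `EvalStatus` subclasses `str`, pydantic (v2 here) coerces the bare wire strings into enum members on input and serializes members back to plain strings on output, which is why the docstring can call the protocol stable. A small round-trip sketch, assuming the package layout shown above:

import json

from agentevals_evaluator_sdk.types import EvalResult, EvalStatus

r = EvalResult.model_validate_json('{"score": 1.0, "status": "PASSED"}')
assert r.status is EvalStatus.PASSED  # wire string -> enum member
assert json.loads(r.model_dump_json())["status"] == "PASSED"  # enum -> wire string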
src/agentevals/_protocol.py (11 additions, 2 deletions)

@@ -14,6 +14,7 @@
 
 from __future__ import annotations
 
+from enum import Enum
 from typing import Any, Optional
 
 from pydantic import BaseModel, Field
@@ -62,13 +63,21 @@ class EvalInput(BaseModel):
     expected_invocations: Optional[list[InvocationData]] = None
 
 
+class EvalStatus(str, Enum):
+    """Allowed ``status`` values on the evaluator JSON wire format (matches evaluator-sdk)."""
+
+    PASSED = "PASSED"
+    FAILED = "FAILED"
+    NOT_EVALUATED = "NOT_EVALUATED"
+
+
 class EvalResult(BaseModel):
     """Output payload expected from a custom evaluator on stdout."""
 
     score: float = Field(ge=0.0, le=1.0)
-    status: Optional[str] = Field(
+    status: Optional[EvalStatus] = Field(
         default=None,
-        description='One of "PASSED", "FAILED", "NOT_EVALUATED". Derived from score vs threshold if omitted.',
+        description="Derived from score vs threshold if omitted.",
    )
     per_invocation_scores: list[Optional[float]] = Field(default_factory=list)
     details: Optional[dict[str, Any]] = None
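The shortened description above leans on the documented fallback: when an evaluator omits `status`, it is derived from the score and the run's threshold. That derivation is not part of this diff; a hypothetical sketch of what the CLI-side logic could look like (`derive_status` and the `threshold` default are illustrative names, not this PR's code):

from agentevals._protocol import EvalResult, EvalStatus

def derive_status(result: EvalResult, threshold: float = 0.5) -> EvalStatus:
    # Hypothetical: honor an explicit status, otherwise compare score to threshold.
    if result.status is not None:
        return result.status
    return EvalStatus.PASSED if result.score >= threshold else EvalStatus.FAILED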
tests/test_protocol.py (25 additions, 0 deletions)

@@ -0,0 +1,25 @@
+"""Tests for the custom evaluator JSON protocol models."""
+
+import pytest
+from pydantic import ValidationError
+
+from agentevals._protocol import EvalResult, EvalStatus
+
+
+def test_eval_result_accepts_valid_status_strings() -> None:
+    raw = '{"score":1.0,"status":"PASSED","per_invocation_scores":[1.0]}'
+    r = EvalResult.model_validate_json(raw)
+    assert r.status == EvalStatus.PASSED
+    assert r.score == 1.0
+
+
+def test_eval_result_rejects_invalid_status() -> None:
+    raw = '{"score":1.0,"status":"MAYBE","per_invocation_scores":[]}'
+    with pytest.raises(ValidationError):
+        EvalResult.model_validate_json(raw)
+
+
+def test_eval_result_omitted_status_ok() -> None:
+    raw = '{"score":0.5,"per_invocation_scores":[]}'
+    r = EvalResult.model_validate_json(raw)
+    assert r.status is None
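These tests pin down the `status` behavior; the `score` bounds (`ge=0.0, le=1.0`) declared on `EvalResult` would support a companion case along these lines (a sketch, not part of the PR):

def test_eval_result_rejects_out_of_range_score() -> None:
    raw = '{"score":1.5,"per_invocation_scores":[]}'
    with pytest.raises(ValidationError):
        EvalResult.model_validate_json(raw)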