From 93c85f542b8d6fad2f1313444b96af2252f8be57 Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Wed, 25 Mar 2026 21:00:00 +0100 Subject: [PATCH 01/60] feat(core): eval-aware types, events, and DI fixes - EvalPayload, EvalScoreEntry on TestResult for eval case results - EVAL_SUITE_END event emitted by core runner - is_eval flag on TestRegistration/TestItem - KindFilterPlugin for protest run vs protest eval - get_type_hints_compat: PEP 563 + TYPE_CHECKING support in all DI sites - Async fixture teardown on same event loop (no more loop mismatch) - Fixture resolution time excluded from test duration - Log records captured on TestResult for --show-logs --- protest/core/collector.py | 11 +- protest/core/execution/test_executor.py | 43 +++++-- protest/core/outcome.py | 146 +++++++++--------------- protest/di/container.py | 34 ++---- protest/di/hints.py | 62 ++++++++++ protest/di/validation.py | 9 +- protest/entities/events.py | 30 ++++- protest/events/types.py | 2 + protest/execution/capture.py | 16 +++ protest/plugin.py | 6 + 10 files changed, 222 insertions(+), 137 deletions(-) create mode 100644 protest/di/hints.py diff --git a/protest/core/collector.py b/protest/core/collector.py index 74dd75d..24356a8 100644 --- a/protest/core/collector.py +++ b/protest/core/collector.py @@ -2,7 +2,7 @@ from inspect import signature from itertools import groupby, product -from typing import TYPE_CHECKING, Annotated, Any, get_args, get_origin, get_type_hints +from typing import TYPE_CHECKING, Annotated, Any, get_args, get_origin from protest.di.decorators import get_fixture_marker, unwrap_fixture from protest.di.markers import Use @@ -18,10 +18,9 @@ def _extract_use_fixtures(func: Callable[..., Any]) -> list[FixtureCallable]: """Extract fixtures referenced via Use() markers in function parameters.""" - try: - type_hints = get_type_hints(func, include_extras=True) - except Exception: - type_hints = {} + from protest.di.hints import 
get_type_hints_compat + + type_hints = get_type_hints_compat(func) fixtures: list[FixtureCallable] = [] for param_name in signature(func).parameters: @@ -164,6 +163,7 @@ def _expand_registration( xfail=test_reg.xfail, timeout=test_reg.timeout, retry=test_reg.retry, + is_eval=test_reg.is_eval, ) ] @@ -188,6 +188,7 @@ def _expand_registration( xfail=test_reg.xfail, timeout=test_reg.timeout, retry=test_reg.retry, + is_eval=test_reg.is_eval, ) ) diff --git a/protest/core/execution/test_executor.py b/protest/core/execution/test_executor.py index 8fa92a3..8b475c6 100644 --- a/protest/core/execution/test_executor.py +++ b/protest/core/execution/test_executor.py @@ -7,7 +7,7 @@ import time from contextlib import AsyncExitStack, asynccontextmanager from inspect import signature -from typing import TYPE_CHECKING, Any, get_type_hints +from typing import TYPE_CHECKING, Any from protest.core.collector import get_transitive_fixtures from protest.core.outcome import OutcomeBuilder, TestExecutionResult @@ -20,11 +20,13 @@ TestStartInfo, TestTeardownInfo, ) +from protest.entities.events import EvalPayload from protest.events.types import Event from protest.exceptions import FixtureError from protest.execution.async_bridge import ensure_async from protest.execution.capture import ( CaptureCurrentTest, + get_current_log_records, reset_current_node_id, set_current_node_id, ) @@ -112,8 +114,6 @@ async def _run_test( # noqa: PLR0912 - complex test execution flow, refactoring ) ) - start = time.perf_counter() - try: kwargs = await self._resolve_test_kwargs(item, ctx) except Exception as exc: @@ -122,13 +122,15 @@ async def _run_test( # noqa: PLR0912 - complex test execution flow, refactoring test_name=test_name, node_id=node_id, suite_path=item.suite_path, - duration=time.perf_counter() - start, + duration=0, output=buffer.getvalue(), error=exc, is_fixture_error=True, ) ) + start = time.perf_counter() + # Conditional skip (callable) - evaluated AFTER fixture resolution if item.skip and 
item.skip.is_conditional: try: @@ -162,26 +164,33 @@ async def _run_test( # noqa: PLR0912 - complex test execution flow, refactoring previous_errors: list[Exception] = [] error: Exception | None = None is_fixture_error = False + eval_payload: EvalPayload | None = None attempt = 1 # Initialized here; always overwritten by loop for attempt in range(1, max_attempts + 1): error = None is_fixture_error = False + eval_payload = None try: if item.timeout is not None: try: - await asyncio.wait_for( + return_value = await asyncio.wait_for( ensure_async(item.func, **kwargs), timeout=item.timeout, ) except asyncio.TimeoutError: - # Only wrap timeout from wait_for, not from test code raise asyncio.TimeoutError( f"Test exceeded timeout of {item.timeout}s" ) from None else: - await ensure_async(item.func, **kwargs) + return_value = await ensure_async(item.func, **kwargs) + + # For eval items: capture EvalPayload and determine pass/fail + if item.is_eval and isinstance(return_value, EvalPayload): + eval_payload = return_value + if not eval_payload.passed: + error = _build_eval_error(eval_payload) except FixtureError as exc: error = exc.original is_fixture_error = True @@ -231,6 +240,9 @@ async def _run_test( # noqa: PLR0912 - complex test execution flow, refactoring attempt=attempt, max_attempts=max_attempts, previous_errors=tuple(previous_errors), + is_eval=item.is_eval, + eval_payload=eval_payload, + log_records=tuple(get_current_log_records()), ) ) @@ -243,10 +255,9 @@ async def _resolve_test_kwargs( func_signature = signature(item.func) kwargs: dict[str, Any] = dict(item.case_kwargs) - try: - type_hints = get_type_hints(item.func, include_extras=True) - except Exception: - type_hints = {} + from protest.di.hints import get_type_hints_compat + + type_hints = get_type_hints_compat(item.func) for param_name, param in func_signature.parameters.items(): if param_name in kwargs: @@ -346,3 +357,13 @@ async def _acquire_fixture_semaphores( for _, sem in sems_sorted: await 
stack.enter_async_context(_semaphore_context(sem)) yield + + +def _build_eval_error(payload: EvalPayload) -> AssertionError: + """Build a descriptive AssertionError from failed eval scores.""" + failed = [ + f"{name}={entry.value}" + for name, entry in payload.scores.items() + if not entry.passed + ] + return AssertionError(f"{', '.join(failed)}") diff --git a/protest/core/outcome.py b/protest/core/outcome.py index b89a7bb..0018812 100644 --- a/protest/core/outcome.py +++ b/protest/core/outcome.py @@ -1,11 +1,17 @@ """Test outcome classification and building.""" +from __future__ import annotations + from dataclasses import dataclass from enum import Enum, auto +from typing import TYPE_CHECKING, Any from protest.entities import SuitePath, TestCounts, TestOutcome, TestResult from protest.events.types import Event +if TYPE_CHECKING: + from protest.entities.events import EvalPayload + class OutcomeType(Enum): """Classification of test execution outcomes.""" @@ -35,13 +41,16 @@ class TestExecutionResult: attempt: int = 1 max_attempts: int = 1 previous_errors: tuple[Exception, ...] = () + is_eval: bool = False + eval_payload: EvalPayload | None = None + log_records: tuple[Any, ...] 
= () class OutcomeBuilder: """Builds TestOutcome from test execution results.""" def build(self, exec_result: TestExecutionResult) -> TestOutcome: - """Build a TestOutcome from execution result by classifying and constructing.""" + """Build a TestOutcome from execution result.""" outcome_type = self._classify(exec_result) match outcome_type: @@ -59,7 +68,6 @@ def build(self, exec_result: TestExecutionResult) -> TestOutcome: return self._build_fail(exec_result) def _classify(self, exec_result: TestExecutionResult) -> OutcomeType: - """Classify execution result into outcome type.""" match ( exec_result.skip_reason, exec_result.error, @@ -79,91 +87,49 @@ def _classify(self, exec_result: TestExecutionResult) -> OutcomeType: case _: return OutcomeType.FAIL - def _build_skip(self, exec_result: TestExecutionResult) -> TestOutcome: - result = TestResult( - name=exec_result.test_name, - node_id=exec_result.node_id, - suite_path=exec_result.suite_path, - skip_reason=exec_result.skip_reason, - timeout=exec_result.timeout, - attempt=exec_result.attempt, - max_attempts=exec_result.max_attempts, - previous_errors=exec_result.previous_errors, - ) - return TestOutcome(result, TestCounts(skipped=1), Event.TEST_SKIP) - - def _build_pass(self, exec_result: TestExecutionResult) -> TestOutcome: - result = TestResult( - name=exec_result.test_name, - node_id=exec_result.node_id, - suite_path=exec_result.suite_path, - duration=exec_result.duration, - output=exec_result.output, - timeout=exec_result.timeout, - attempt=exec_result.attempt, - max_attempts=exec_result.max_attempts, - previous_errors=exec_result.previous_errors, - ) - return TestOutcome(result, TestCounts(passed=1), Event.TEST_PASS) - - def _build_xpass(self, exec_result: TestExecutionResult) -> TestOutcome: - result = TestResult( - name=exec_result.test_name, - node_id=exec_result.node_id, - suite_path=exec_result.suite_path, - duration=exec_result.duration, - output=exec_result.output, - 
xfail_reason=exec_result.xfail_reason, - timeout=exec_result.timeout, - attempt=exec_result.attempt, - max_attempts=exec_result.max_attempts, - previous_errors=exec_result.previous_errors, - ) - return TestOutcome(result, TestCounts(xpassed=1), Event.TEST_XPASS) - - def _build_error(self, exec_result: TestExecutionResult) -> TestOutcome: - result = TestResult( - name=exec_result.test_name, - node_id=exec_result.node_id, - suite_path=exec_result.suite_path, - error=exec_result.error, - duration=exec_result.duration, - output=exec_result.output, - is_fixture_error=True, - timeout=exec_result.timeout, - attempt=exec_result.attempt, - max_attempts=exec_result.max_attempts, - previous_errors=exec_result.previous_errors, - ) - return TestOutcome(result, TestCounts(errored=1), Event.TEST_FAIL) - - def _build_xfail(self, exec_result: TestExecutionResult) -> TestOutcome: - result = TestResult( - name=exec_result.test_name, - node_id=exec_result.node_id, - suite_path=exec_result.suite_path, - error=exec_result.error, - duration=exec_result.duration, - output=exec_result.output, - xfail_reason=exec_result.xfail_reason, - timeout=exec_result.timeout, - attempt=exec_result.attempt, - max_attempts=exec_result.max_attempts, - previous_errors=exec_result.previous_errors, - ) - return TestOutcome(result, TestCounts(xfailed=1), Event.TEST_XFAIL) - - def _build_fail(self, exec_result: TestExecutionResult) -> TestOutcome: - result = TestResult( - name=exec_result.test_name, - node_id=exec_result.node_id, - suite_path=exec_result.suite_path, - error=exec_result.error, - duration=exec_result.duration, - output=exec_result.output, - timeout=exec_result.timeout, - attempt=exec_result.attempt, - max_attempts=exec_result.max_attempts, - previous_errors=exec_result.previous_errors, - ) - return TestOutcome(result, TestCounts(failed=1), Event.TEST_FAIL) + def _base_kwargs(self, er: TestExecutionResult) -> dict[str, object]: + """Common TestResult kwargs from an execution result.""" + return { 
+ "name": er.test_name, + "node_id": er.node_id, + "suite_path": er.suite_path, + "duration": er.duration, + "output": er.output, + "timeout": er.timeout, + "attempt": er.attempt, + "max_attempts": er.max_attempts, + "previous_errors": er.previous_errors, + "is_eval": er.is_eval, + "eval_payload": er.eval_payload, + "log_records": er.log_records, + } + + def _build_skip(self, er: TestExecutionResult) -> TestOutcome: + kw = self._base_kwargs(er) + kw.update(duration=0, output="", skip_reason=er.skip_reason) + return TestOutcome(TestResult(**kw), TestCounts(skipped=1), Event.TEST_SKIP) # type: ignore[arg-type] + + def _build_pass(self, er: TestExecutionResult) -> TestOutcome: + return TestOutcome( + TestResult(**self._base_kwargs(er)), TestCounts(passed=1), Event.TEST_PASS + ) # type: ignore[arg-type] + + def _build_xpass(self, er: TestExecutionResult) -> TestOutcome: + kw = self._base_kwargs(er) + kw["xfail_reason"] = er.xfail_reason + return TestOutcome(TestResult(**kw), TestCounts(xpassed=1), Event.TEST_XPASS) # type: ignore[arg-type] + + def _build_error(self, er: TestExecutionResult) -> TestOutcome: + kw = self._base_kwargs(er) + kw.update(error=er.error, is_fixture_error=True) + return TestOutcome(TestResult(**kw), TestCounts(errored=1), Event.TEST_FAIL) # type: ignore[arg-type] + + def _build_xfail(self, er: TestExecutionResult) -> TestOutcome: + kw = self._base_kwargs(er) + kw.update(error=er.error, xfail_reason=er.xfail_reason) + return TestOutcome(TestResult(**kw), TestCounts(xfailed=1), Event.TEST_XFAIL) # type: ignore[arg-type] + + def _build_fail(self, er: TestExecutionResult) -> TestOutcome: + kw = self._base_kwargs(er) + kw["error"] = er.error + return TestOutcome(TestResult(**kw), TestCounts(failed=1), Event.TEST_FAIL) # type: ignore[arg-type] diff --git a/protest/di/container.py b/protest/di/container.py index 8ab6e49..5c38571 100644 --- a/protest/di/container.py +++ b/protest/di/container.py @@ -11,7 +11,6 @@ Any, get_args, get_origin, - 
get_type_hints, overload, ) @@ -741,8 +740,9 @@ async def _run_teardown_interruptible( """Run exit stack teardown, interruptible by cancellation event. Returns True if cancelled (should abort), False if completed normally. - Teardown runs in a thread pool so sync blocking code doesn't freeze - the event loop, allowing us to detect and respond to cancellation. + Teardown runs on the SAME event loop as fixture setup — creating a + new loop would break async resources (drivers, connections) that hold + references to the original loop. """ if interrupt_event is None: await exit_stack.__aexit__(exc_type, exc_val, exc_tb) @@ -751,23 +751,10 @@ async def _run_teardown_interruptible( if interrupt_event.is_set(): return True - # Run teardown in thread pool so sync code doesn't block event loop - loop = asyncio.get_running_loop() - - def run_sync_teardown() -> None: - # Create a new event loop for the thread to run async teardowns - new_loop = asyncio.new_event_loop() - try: - new_loop.run_until_complete( - exit_stack.__aexit__(exc_type, exc_val, exc_tb) - ) - finally: - new_loop.close() - - async def run_in_thread() -> None: - await loop.run_in_executor(None, run_sync_teardown) - - teardown_task = asyncio.create_task(run_in_thread()) + # Run teardown on the same loop, race with cancellation + teardown_task = asyncio.create_task( + exit_stack.__aexit__(exc_type, exc_val, exc_tb) + ) wait_cancel = asyncio.create_task(interrupt_event.wait()) done, _ = await asyncio.wait( @@ -793,10 +780,9 @@ def _analyze_and_store_dependencies( actual_func = unwrap_fixture(func) func_signature = signature(actual_func) - try: - type_hints = get_type_hints(actual_func, include_extras=True) - except Exception: - type_hints = {} + from protest.di.hints import get_type_hints_compat + + type_hints = get_type_hints_compat(actual_func) dependencies: dict[str, FixtureCallable] = {} for param_name, param in func_signature.parameters.items(): diff --git a/protest/di/hints.py b/protest/di/hints.py new 
file mode 100644 index 0000000..ede4c12 --- /dev/null +++ b/protest/di/hints.py @@ -0,0 +1,62 @@ +"""Type hints resolution with PEP 563 / TYPE_CHECKING compatibility. + +Shared by the core DI system and evals runner. Handles two failure modes: + +1. Local fixtures — ``from __future__ import annotations`` stringifies + annotations; names defined in local scopes aren't in ``func.__globals__``. + Fix: collect locals from the call stack. + +2. TYPE_CHECKING-only types — e.g. ``AsyncDriver`` imported only under + ``if TYPE_CHECKING:``. Fix: substitute ``Any`` for each unresolvable + name. The type itself is irrelevant for DI; only the ``Use(...)`` + marker inside ``Annotated[...]`` matters. +""" + +from __future__ import annotations + +import inspect +import re +from typing import Any, get_type_hints + + +def get_type_hints_compat(func: Any) -> dict[str, Any]: + """Resolve type hints with PEP 563 / TYPE_CHECKING fallbacks.""" + try: + return get_type_hints(func, include_extras=True) + except Exception: + pass + + # Build a namespace from the entire call stack (covers local fixtures). + localns: dict[str, Any] = {} + try: + for frame_info in inspect.stack(): + localns.update(frame_info.frame.f_locals) + except Exception: + pass + + try: + return get_type_hints(func, localns=localns, include_extras=True) + except Exception: + pass + + # TYPE_CHECKING fallback: substitute Any for unresolvable names. 
+ return _get_type_hints_substituting_any(func, localns) + + +def _get_type_hints_substituting_any( + func: Any, + localns: dict[str, Any], +) -> dict[str, Any]: + """Retry get_type_hints, replacing each NameError'd name with Any.""" + localns = dict(localns) + for _ in range(20): + try: + return get_type_hints(func, localns=localns, include_extras=True) + except NameError as exc: + match = re.search(r"name '(\w+)' is not defined", str(exc)) + if not match: + break + localns[match.group(1)] = Any + except Exception: + break + return {} diff --git a/protest/di/validation.py b/protest/di/validation.py index 2d6cd18..d716397 100644 --- a/protest/di/validation.py +++ b/protest/di/validation.py @@ -3,7 +3,7 @@ from __future__ import annotations from inspect import signature -from typing import TYPE_CHECKING, Annotated, Any, get_args, get_origin, get_type_hints +from typing import TYPE_CHECKING, Annotated, Any, get_args, get_origin from protest.di.markers import ForEach, From from protest.exceptions import ParameterizedFixtureError @@ -15,10 +15,9 @@ def _extract_from_params(func: Callable[..., Any]) -> dict[str, ForEach[Any]]: """Extract parameters annotated with From(source).""" - try: - type_hints = get_type_hints(func, include_extras=True) - except Exception: - type_hints = {} + from protest.di.hints import get_type_hints_compat + + type_hints = get_type_hints_compat(func) result: dict[str, ForEach[Any]] = {} for param_name in signature(func).parameters: diff --git a/protest/entities/events.py b/protest/entities/events.py index f87d9d9..d76434c 100644 --- a/protest/entities/events.py +++ b/protest/entities/events.py @@ -1,13 +1,36 @@ from __future__ import annotations -from dataclasses import dataclass -from typing import TYPE_CHECKING +from dataclasses import dataclass, field +from typing import TYPE_CHECKING, Any if TYPE_CHECKING: from protest.entities import FixtureScope, SuitePath from protest.events.types import Event +@dataclass(frozen=True, slots=True) +class 
EvalScoreEntry: + """A single score entry from an evaluator.""" + + value: float | bool | str + passed: bool = True + + +@dataclass(frozen=True, slots=True) +class EvalPayload: + """Structured payload for eval results, carried on TestResult.""" + + case_name: str + passed: bool + task_duration: float + inputs: Any = None + output: Any = None + expected_output: Any = None + scores: dict[str, EvalScoreEntry] = field(default_factory=dict) + case_hash: str = "" + eval_hash: str = "" + + @dataclass(frozen=True, slots=True) class TestCounts: passed: int = 0 @@ -43,6 +66,9 @@ class TestResult: attempt: int = 1 max_attempts: int = 1 previous_errors: tuple[Exception, ...] = () + is_eval: bool = False + eval_payload: EvalPayload | None = None + log_records: tuple[Any, ...] = () @dataclass(frozen=True, slots=True) diff --git a/protest/events/types.py b/protest/events/types.py index 8f4d1fc..05d9fa2 100644 --- a/protest/events/types.py +++ b/protest/events/types.py @@ -16,6 +16,7 @@ class Event(Enum): SUITE_SETUP_DONE = "suite_setup_done" SUITE_TEARDOWN_START = "suite_teardown_start" SUITE_END = "suite_end" + EVAL_SUITE_END = "eval_suite_end" TEST_START = "test_start" TEST_ACQUIRED = "test_acquired" TEST_SETUP_DONE = "test_setup_done" @@ -34,3 +35,4 @@ class Event(Enum): FIXTURE_TEARDOWN_START = "fixture_teardown_start" FIXTURE_TEARDOWN_DONE = "fixture_teardown_done" SESSION_INTERRUPTED = "session_interrupted" + USER_PRINT = "user_print" diff --git a/protest/execution/capture.py b/protest/execution/capture.py index d05fe00..2e258a7 100644 --- a/protest/execution/capture.py +++ b/protest/execution/capture.py @@ -19,6 +19,7 @@ ) _current_node_id: ContextVar[str | None] = ContextVar("current_node_id", default=None) +_event_bus_ref: ContextVar[object | None] = ContextVar("event_bus_ref", default=None) @dataclass(slots=True) @@ -100,6 +101,21 @@ def get_session_teardown_output() -> str: return _session_teardown.buffer.getvalue() if _session_teardown.buffer else "" +def 
set_event_bus(bus: object) -> Token[object | None]: + """Set event bus reference for console.print() access.""" + return _event_bus_ref.set(bus) + + +def reset_event_bus(token: Token[object | None]) -> None: + """Reset event bus reference.""" + _event_bus_ref.reset(token) + + +def get_event_bus() -> object | None: + """Get current event bus (for console.print).""" + return _event_bus_ref.get() + + class TaskAwareStream: def __init__(self, original_stream: TextIO, show_output: bool = False) -> None: self._original = original_stream diff --git a/protest/plugin.py b/protest/plugin.py index 6833b03..9589fff 100644 --- a/protest/plugin.py +++ b/protest/plugin.py @@ -142,6 +142,12 @@ def on_suite_teardown_start(self, path: SuitePath) -> None | Awaitable[None]: def on_suite_end(self, result: SuiteResult) -> None | Awaitable[None]: """Suite ends (after fixture teardown).""" + def on_eval_suite_end(self, report: Any) -> None | Awaitable[None]: + """Eval suite finished — aggregated report with scores/stats.""" + + def on_user_print(self, data: Any) -> None | Awaitable[None]: + """User-initiated print via protest.console.print().""" + # ───────────────────────────────────────────────────────────────────── # Fixture lifecycle # ───────────────────────────────────────────────────────────────────── From 5041457c4561f137c797a97563467db34c701ef3 Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Thu, 26 Mar 2026 20:00:00 +0100 Subject: [PATCH 02/60] feat(evals): native eval system with @session.eval() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit An eval is a test that returns a scored value. Uses ForEach/From for parametrization — no separate EvalSuite/EvalCase framework. 
- @session.eval(evaluators=[...]) decorator - @evaluator decorator with partial-application binding - EvalSession(model=) for eval-focused sessions - EvalContext passed to evaluators - Scoring v2: evaluators return bool or dataclass - Annotated[bool, Verdict] → pass/fail - Annotated[float, Metric] → stats aggregation - Annotated[str, Reason] → displayed on failure - EvalCase dataclass for typed ForEach data - Built-in evaluators: contains_keywords, not_empty, max_length, etc. - EvalHistoryPlugin listens to EVAL_SUITE_END - EvalResultsWriter for per-case .md files - Evaluator exception → error (not fail) --- protest/core/runner.py | 76 ++++++++++++++ protest/core/session.py | 115 +++++++++++++++++++-- protest/core/suite.py | 39 ++++++- protest/evals/__init__.py | 28 +++++ protest/evals/evaluator.py | 176 ++++++++++++++++++++++++++++++++ protest/evals/evaluators.py | 143 ++++++++++++++++++++++++++ protest/evals/hashing.py | 51 +++++++++ protest/evals/history.py | 159 +++++++++++++++++++++++++++++ protest/evals/results_writer.py | 155 ++++++++++++++++++++++++++++ protest/evals/session.py | 44 ++++++++ protest/evals/types.py | 168 ++++++++++++++++++++++++++++++ protest/evals/wrapper.py | 176 ++++++++++++++++++++++++++++++++ protest/filters/kind.py | 36 +++++++ 13 files changed, 1358 insertions(+), 8 deletions(-) create mode 100644 protest/evals/__init__.py create mode 100644 protest/evals/evaluator.py create mode 100644 protest/evals/evaluators.py create mode 100644 protest/evals/hashing.py create mode 100644 protest/evals/history.py create mode 100644 protest/evals/results_writer.py create mode 100644 protest/evals/session.py create mode 100644 protest/evals/types.py create mode 100644 protest/evals/wrapper.py create mode 100644 protest/filters/kind.py diff --git a/protest/core/runner.py b/protest/core/runner.py index 0347c2d..70669d0 100644 --- a/protest/core/runner.py +++ b/protest/core/runner.py @@ -1,7 +1,10 @@ """Test runner orchestration.""" +from __future__ 
import annotations + import asyncio import time +from typing import TYPE_CHECKING, Any from protest.core.collector import Collector from protest.core.execution import ParallelExecutor, SuiteManager, TestExecutor @@ -22,6 +25,10 @@ from protest.execution.context import cancellation_event from protest.execution.interrupt import InterruptHandler +if TYPE_CHECKING: + from protest.entities.events import TestResult + from protest.evals.types import EvalCaseResult + class TestRunner: """Executes tests with parallel support and fixture lifecycle management. @@ -36,6 +43,7 @@ def __init__(self, session: ProTestSession) -> None: self._interrupt_handler = InterruptHandler() self._interrupted = False self._force_interrupt_emitted = False + self._eval_results: dict[str, list[EvalCaseResult]] = {} # Extracted components self._suite_manager = SuiteManager(session) @@ -61,10 +69,23 @@ def run(self) -> RunResult: self._interrupt_handler.uninstall() loop.close() + def _collect_eval_result(self, result: TestResult) -> None: + """Internal handler: collect eval results from TEST_PASS/FAIL events.""" + if not result.is_eval or result.eval_payload is None: + return + suite_name = result.suite_path.root_name if result.suite_path else "evals" + case_result = _build_eval_case_result(result) + self._eval_results.setdefault(suite_name, []).append(case_result) + async def _main_loop(self) -> bool: """The main async loop for running tests.""" session_start = time.perf_counter() + # Register internal eval collector before tests run + self._eval_results.clear() + self._session.events.on(Event.TEST_PASS, self._collect_eval_result) + self._session.events.on(Event.TEST_FAIL, self._collect_eval_result) + collector = Collector() items = collector.collect(self._session) @@ -79,9 +100,12 @@ async def _main_loop(self) -> bool: total_counts = TestCounts() # Inject cancellation event into context for teardown awareness + from protest.execution.capture import reset_event_bus, set_event_bus + cancel_token = 
cancellation_event.set( self._interrupt_handler.force_teardown_event ) + bus_token = set_event_bus(self._session.events) try: with GlobalCapturePatch(show_output=not self._session.capture): async with self._session: @@ -112,6 +136,8 @@ async def _main_loop(self) -> bool: ): suite_result = self._suite_manager.build_result(suite_path) await self._session.events.emit(Event.SUITE_END, suite_result) + # Emit EVAL_SUITE_END for eval suites + await self._emit_eval_suite_end(suite_path) await self._session.events.emit(Event.SESSION_TEARDOWN_START) finally: @@ -124,6 +150,7 @@ async def _main_loop(self) -> bool: await self._session.events.emit(Event.SESSION_INTERRUPTED, True) self._force_interrupt_emitted = True cancellation_event.reset(cancel_token) + reset_event_bus(bus_token) if self._interrupt_handler.should_stop_new_tests: self._interrupted = True @@ -151,8 +178,57 @@ async def _main_loop(self) -> bool: await self._session.events.wait_pending() await self._session.events.emit(Event.SESSION_COMPLETE, session_result) + # Unregister eval collector + self._session.events.off(Event.TEST_PASS, self._collect_eval_result) + self._session.events.off(Event.TEST_FAIL, self._collect_eval_result) + return ( total_counts.failed == 0 and total_counts.errored == 0 and total_counts.xpassed == 0 ) + + async def _emit_eval_suite_end(self, suite_path: Any) -> None: + """Emit EVAL_SUITE_END if this suite_path corresponds to an eval suite.""" + from protest.evals.types import EvalSuiteReport + + suite_name = ( + suite_path.root_name + if hasattr(suite_path, "root_name") + else str(suite_path) + ) + eval_cases = self._eval_results.get(suite_name) + if not eval_cases: + return + report = EvalSuiteReport( + suite_name=suite_name, + cases=tuple(eval_cases), + duration=sum(c.duration for c in eval_cases), + ) + await self._session.events.emit(Event.EVAL_SUITE_END, report) + + +def _build_eval_case_result(result: TestResult) -> EvalCaseResult: + """Build EvalCaseResult from a TestResult with 
eval_payload.""" + from protest.evals.types import EvalCaseResult, EvalScore + + payload = result.eval_payload + assert payload is not None + return EvalCaseResult( + case_name=payload.case_name or "", + node_id=result.node_id, + scores=tuple( + EvalScore( + name=name, + value=entry.value, + ) + for name, entry in payload.scores.items() + ), + duration=payload.task_duration, + passed=not (result.error is not None or not payload.passed), + inputs=payload.inputs, + output=payload.output, + expected_output=payload.expected_output, + case_hash=payload.case_hash, + eval_hash=payload.eval_hash, + ) diff --git a/protest/core/session.py b/protest/core/session.py index 778dbb3..3224028 100644 --- a/protest/core/session.py +++ b/protest/core/session.py @@ -1,14 +1,16 @@ from __future__ import annotations -from typing import TYPE_CHECKING, TypeVar +from typing import TYPE_CHECKING, Any, TypeVar if TYPE_CHECKING: from collections.abc import Callable + from pathlib import Path from types import TracebackType from protest.compat import Self from protest.core.suite import ProTestSuite from protest.entities import FixtureCallable + from protest.evals.types import JudgeInfo, ModelInfo from protest.plugin import PluginBase, PluginContext from protest.cache.plugin import CachePlugin @@ -31,6 +33,7 @@ from protest.exceptions import InvalidMaxConcurrencyError from protest.execution.capture import set_session_teardown_capture from protest.filters.keyword import KeywordFilterPlugin +from protest.filters.kind import KindFilterPlugin from protest.filters.suite import SuiteFilterPlugin from protest.reporting.ascii import AsciiReporter from protest.reporting.ctrf import CTRFReporter @@ -54,7 +57,13 @@ class ProTestSession: concurrency: Number of parallel test workers (default: 1). 
""" - def __init__(self, concurrency: int = 1) -> None: + def __init__( + self, + concurrency: int = 1, + history: bool = False, + history_dir: Path | None = None, + metadata: dict[str, Any] | None = None, + ) -> None: if concurrency < 1: raise InvalidMaxConcurrencyError(concurrency) @@ -72,6 +81,11 @@ def __init__(self, concurrency: int = 1) -> None: self._capture: bool = True self._setup_duration: float = 0 self._teardown_duration: float = 0 + self._history = history + self._history_dir = history_dir + self._metadata: dict[str, Any] = dict(metadata) if metadata else {} + self._eval_model: ModelInfo | None = None # set by EvalSession + self._eval_judge: JudgeInfo | None = None # set by EvalSession async def resolve_autouse(self) -> None: """Resolve all session autouse fixtures at session start.""" @@ -104,6 +118,18 @@ def capture(self) -> bool: def capture(self, value: bool) -> None: self._capture = value + @property + def history(self) -> bool: + return self._history + + @property + def history_dir(self) -> Path | None: + return self._history_dir + + @property + def metadata(self) -> dict[str, Any]: + return self._metadata + @property def setup_duration(self) -> float: """Duration of session setup (available after resolve_autouse).""" @@ -151,6 +177,7 @@ def test( skip_reason: str = "Skipped", xfail: bool | str | Xfail | None = None, retry: int | Retry | None = None, + is_eval: bool = False, ) -> Callable[[FuncT], FuncT]: def decorator(func: FuncT) -> FuncT: if timeout is not None and timeout < 0: @@ -168,21 +195,64 @@ def decorator(func: FuncT) -> FuncT: xfail=norm_xfail, timeout=timeout, retry=norm_retry, + is_eval=is_eval, ) ) return func return decorator + def eval( + self, + evaluators: list[Any] | None = None, + expected_key: str = "expected", + tags: list[str] | None = None, + timeout: float | None = None, + name: str | None = None, + model: Any = None, + ) -> Callable[[FuncT], FuncT]: + """Register a scored eval test. 
+ + Creates an implicit eval suite named after the function. + The decorated function's return value is passed to evaluators. + Use with ForEach/From for parametrization:: + + @session.eval(evaluators=[my_scorer], model=ModelInfo(name="qwen")) + async def my_eval(case: Annotated[dict, From(cases)]) -> str: + return await run(case["q"]) + """ + from protest.core.suite import ProTestSuite + from protest.evals.wrapper import make_eval_wrapper + + def decorator(func: FuncT) -> FuncT: + suite_name = name or func.__name__ + suite_meta: dict[str, Any] = {} + resolved_model = model or getattr(self, "_eval_model", None) + if resolved_model: + suite_meta["model"] = resolved_model.name + suite_meta["provider"] = getattr(resolved_model, "provider", None) + suite = ProTestSuite( + name=suite_name, + tags=list(tags or []), + kind="eval", + metadata=suite_meta, + ) + wrapper = make_eval_wrapper( + func, + evaluators or [], + expected_key, + ) + suite.test(tags=tags, timeout=timeout, is_eval=True)(wrapper) + self.add_suite(suite) + return func + + return decorator + def add_suite(self, suite: ProTestSuite) -> None: """Add a suite to this session.""" suite._attach_to_session(self) self._suites.append(suite) - def include_suite(self, suite: ProTestSuite) -> None: - """Alias for add_suite (backward compatibility).""" - self.add_suite(suite) - def bind( self, fn: FixtureCallable, @@ -246,6 +316,7 @@ def default_plugin_classes() -> list[type[PluginBase]]: TagFilterPlugin, SuiteFilterPlugin, KeywordFilterPlugin, + KindFilterPlugin, RichReporter, AsciiReporter, CTRFReporter, @@ -256,6 +327,10 @@ def register_default_plugins(self) -> None: """Register all standard ProTest plugins for CLI discovery.""" for plugin_class in self.default_plugin_classes(): self.use(plugin_class) + if self._history: + from protest.history.plugin import HistoryPlugin + + self.register_plugin(HistoryPlugin(history_dir=self._history_dir)) @property def plugin_classes(self) -> list[type[PluginBase]]: @@ -294,6 
+369,34 @@ def activate_plugins(self, ctx: PluginContext) -> None: if instance is not None: self.register_plugin(instance) + # Auto-wire eval support if any suite has kind="eval" + if any(s.kind == "eval" for s in self._suites): + self._wire_eval_support() + + def _wire_eval_support(self) -> None: + """Wire eval history + results writer plugins (no EvalPlugin).""" + from protest.evals.history import EvalHistoryPlugin + from protest.evals.results_writer import EvalResultsWriter + + judge_dict = None + if self._eval_judge: + judge_dict = { + "name": self._eval_judge.name, + "provider": getattr(self._eval_judge, "provider", None), + "evaluators": list(getattr(self._eval_judge, "evaluators", ())), + } + + history = EvalHistoryPlugin( + history_dir=self._history_dir, + model=self._eval_model, + judge=judge_dict, + metadata=self._metadata, + ) + self.register_plugin(history) + + writer = EvalResultsWriter(history_dir=self._history_dir) + self.register_plugin(writer) + async def __aenter__(self) -> Self: self._register_fixtures() await self._resolver.__aenter__() diff --git a/protest/core/suite.py b/protest/core/suite.py index 1176842..dfb64c3 100644 --- a/protest/core/suite.py +++ b/protest/core/suite.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import TYPE_CHECKING, TypeVar +from typing import TYPE_CHECKING, Any, TypeVar from protest.di.decorators import unwrap_fixture @@ -42,18 +42,22 @@ class ProTestSuite: description: Optional description for documentation purposes. 
""" - def __init__( + def __init__( # noqa: PLR0913 self, name: str, max_concurrency: int | None = None, tags: list[str] | None = None, description: str | None = None, + kind: str = "test", + metadata: dict[str, Any] | None = None, ) -> None: if max_concurrency is not None and max_concurrency < 1: raise InvalidMaxConcurrencyError(max_concurrency) self._name = name + self._kind = kind self._description = description + self._metadata: dict[str, Any] = dict(metadata) if metadata else {} self._session: ProTestSession | None = None self._parent_suite: ProTestSuite | None = None self._tests: list[TestRegistration] = [] @@ -70,6 +74,14 @@ def name(self) -> str: def description(self) -> str | None: return self._description + @property + def kind(self) -> str: + return self._kind + + @property + def suite_metadata(self) -> dict[str, Any]: + return self._metadata + @property def full_path(self) -> SuitePath: """Return hierarchical path: Parent::Child::GrandChild.""" @@ -122,6 +134,7 @@ def test( # noqa: PLR0913 - test decorator requires flexible params skip_reason: str = "Skipped", xfail: bool | str | Xfail | None = None, retry: int | Retry | None = None, + is_eval: bool = False, ) -> Callable[[FuncT], FuncT]: def decorator(func: FuncT) -> FuncT: if timeout is not None and timeout < 0: @@ -139,12 +152,34 @@ def decorator(func: FuncT) -> FuncT: xfail=norm_xfail, timeout=timeout, retry=norm_retry, + is_eval=is_eval, ) ) return func return decorator + def eval( + self, + evaluators: list[Any] | None = None, + expected_key: str = "expected", + tags: list[str] | None = None, + timeout: float | None = None, + ) -> Callable[[FuncT], FuncT]: + """Register a scored eval test on this suite.""" + from protest.evals.wrapper import make_eval_wrapper + + def decorator(func: FuncT) -> FuncT: + wrapper = make_eval_wrapper( + func, + evaluators or [], + expected_key, + ) + self.test(tags=tags, timeout=timeout, is_eval=True)(wrapper) + return func + + return decorator + def add_suite(self, 
suite: ProTestSuite) -> None: """Add a child suite. Child can access parent's fixtures.""" parent_effective = self.effective_max_concurrency diff --git a/protest/evals/__init__.py b/protest/evals/__init__.py new file mode 100644 index 0000000..17b35c9 --- /dev/null +++ b/protest/evals/__init__.py @@ -0,0 +1,28 @@ +"""ProTest evals — native eval support.""" + +from protest.evals.evaluator import EvalCase, EvalContext, Metric, Reason, Verdict, evaluator +from protest.evals.session import EvalSession +from protest.evals.types import ( + EvalCaseResult, + EvalScore, + EvalSuiteReport, + JudgeInfo, + ModelInfo, + ScoreStats, +) + +__all__ = [ + "EvalCase", + "EvalCaseResult", + "EvalContext", + "Metric", + "EvalScore", + "EvalSession", + "EvalSuiteReport", + "JudgeInfo", + "ModelInfo", + "Reason", + "ScoreStats", + "Verdict", + "evaluator", +] diff --git a/protest/evals/evaluator.py b/protest/evals/evaluator.py new file mode 100644 index 0000000..336df8d --- /dev/null +++ b/protest/evals/evaluator.py @@ -0,0 +1,176 @@ +"""Evaluator primitives — functions, not classes. + +An evaluator is a callable that receives an EvalContext and returns a score. +The @evaluator decorator adds partial-application ergonomics: + + @evaluator + def contains_keywords(ctx: EvalContext, keywords: list[str]) -> ContainsKeywordsResult: + found = sum(1 for k in keywords if k.lower() in ctx.output.lower()) + return ContainsKeywordsResult(keyword_recall=found / len(keywords), ...) + + # Bind params → returns a callable(ctx) via functools.partial + evaluators=[contains_keywords(keywords=["paris", "france"])] + + # No params → use directly + @evaluator + def not_empty(ctx: EvalContext) -> bool: + return bool(ctx.output.strip()) + +Async evaluators are supported: + + @evaluator + async def llm_judge(ctx: EvalContext, model: str = "haiku") -> bool: + ... + +Evaluators return either bool (simple verdict) or a dataclass (structured result). 
+The framework reads fields by type: +- bool → verdict (pass/fail = all(bool_fields)) +- float → metric (aggregated in stats) +- str → reason (displayed on failure) +""" + +from __future__ import annotations + +import asyncio +import dataclasses +import functools +import inspect +from dataclasses import dataclass, field +from typing import Any, Generic, TypeVar + +I = TypeVar("I") +O = TypeVar("O") + + +@dataclass +class EvalContext(Generic[I, O]): + """Context passed to evaluator functions.""" + + name: str + inputs: I + output: O + expected_output: O | None + metadata: Any + duration: float + + +@dataclass +class EvalCase: + """Typed container for eval case data in ForEach. + + Usage:: + + cases = ForEach([ + EvalCase(inputs="Who is Marie?", expected="Marie, Resistance", name="lookup"), + EvalCase(inputs="Who is Pierre?", expected="Pierre, arrest"), + ]) + + @session.eval(evaluators=[contains_facts]) + def my_eval(case: Annotated[EvalCase, From(cases)]) -> str: + return ask(case.inputs) + """ + + inputs: Any + expected: Any = None + name: str = "" + evaluators: list[Any] = field(default_factory=list) + metadata: dict[str, Any] = field(default_factory=dict) + + def __repr__(self) -> str: + return self.name or f"EvalCase({self.inputs!r})" + + +class Metric: + """Annotate a float/int field as a metric for stats aggregation.""" + + +class Verdict: + """Annotate a bool field as a verdict for pass/fail.""" + + +class Reason: + """Annotate a str field as a reason displayed on failure.""" + + +def extract_scores_from_result(result: Any, evaluator_name: str) -> list[Any]: + """Extract EvalScore instances from an evaluator result. + + For bool returns: a single verdict named after the evaluator. + For dataclass returns: only fields annotated with Metric/Verdict/Reason + are extracted. Unannotated fields are ignored (free metadata). + + Raises: + TypeError: If result is not bool or dataclass. 
+ """ + from typing import Annotated, get_args, get_origin, get_type_hints + + from protest.evals.types import EvalScore + + if isinstance(result, bool): + return [EvalScore(name=evaluator_name, value=result)] + + if dataclasses.is_dataclass(result) and not isinstance(result, type): + scores = [] + hints = get_type_hints(type(result), include_extras=True) + for f in dataclasses.fields(result): + ann = hints.get(f.name) + if ann is None or get_origin(ann) is not Annotated: + continue + for meta in get_args(ann)[1:]: + if isinstance(meta, type) and issubclass(meta, (Metric, Verdict, Reason)): + scores.append(EvalScore(name=f.name, value=getattr(result, f.name))) + break + return scores + + type_name = type(result).__name__ + raise TypeError( + f"Evaluator must return bool or dataclass, got {type_name}" + ) + + +def evaluator(fn: Any) -> Any: + """Decorator that turns a function into a protest evaluator. + + The decorated function can be called two ways: + + 1. ``evaluator_fn(ctx)`` — evaluate directly + 2. ``evaluator_fn(keyword=value, ...)`` — returns a bound evaluator (partial) + + This is just ``functools.partial`` with nicer ergonomics: when the first + positional argument is an ``EvalContext``, the function evaluates. Otherwise, + all arguments are bound and the result is a new callable expecting only ``ctx``. 
+ """ + sig = inspect.signature(fn) + params = list(sig.parameters.values()) + has_extra_params = len(params) > 1 + + @functools.wraps(fn) + def wrapper(*args: Any, **kwargs: Any) -> Any: + # Direct call: first positional arg is an EvalContext + if args and isinstance(args[0], EvalContext): + return fn(*args, **kwargs) + # Bind params → return partial + if has_extra_params and kwargs: + bound = functools.partial(fn, **kwargs) + # Preserve async detection on the partial + bound._is_async_evaluator = asyncio.iscoroutinefunction(fn) # type: ignore[attr-defined] + bound.__name__ = fn.__name__ # type: ignore[attr-defined] + bound.__qualname__ = fn.__qualname__ # type: ignore[attr-defined] + return bound + # No args at all — if no extra params, this IS the evaluator + if not has_extra_params and not args and not kwargs: + return fn + return fn(*args, **kwargs) + + wrapper._is_evaluator = True # type: ignore[attr-defined] + wrapper._is_async_evaluator = asyncio.iscoroutinefunction(fn) # type: ignore[attr-defined] + return wrapper + + +def is_async_evaluator(fn: Any) -> bool: + """Check if an evaluator (or partial thereof) is async.""" + if hasattr(fn, "_is_async_evaluator"): + return fn._is_async_evaluator + if isinstance(fn, functools.partial): + return asyncio.iscoroutinefunction(fn.func) + return asyncio.iscoroutinefunction(fn) diff --git a/protest/evals/evaluators.py b/protest/evals/evaluators.py new file mode 100644 index 0000000..b9b1475 --- /dev/null +++ b/protest/evals/evaluators.py @@ -0,0 +1,143 @@ +"""Built-in evaluators for common eval patterns. + +Evaluators return either bool (simple verdict) or a dataclass with +annotated fields: Annotated[bool, Verdict], Annotated[float, Metric], +Annotated[str, Reason]. Unannotated fields are ignored by the runner. 
+""" + +from __future__ import annotations + +import json as json_module +import re +from dataclasses import dataclass +from typing import Annotated + +from protest.evals.evaluator import EvalContext, Metric, Verdict, evaluator + + +@dataclass(frozen=True, slots=True) +class ContainsKeywordsResult: + keyword_recall: Annotated[float, Metric] + all_keywords_present: Annotated[bool, Verdict] + + +@dataclass(frozen=True, slots=True) +class DoesNotContainResult: + no_forbidden_words: Annotated[bool, Verdict] + + +@dataclass(frozen=True, slots=True) +class MaxLengthResult: + conciseness: Annotated[float, Metric] + within_limit: Annotated[bool, Verdict] + + +@dataclass(frozen=True, slots=True) +class JsonValidResult: + valid_json: Annotated[bool, Verdict] + has_required_keys: Annotated[bool, Verdict] + + +@dataclass(frozen=True, slots=True) +class WordOverlapResult: + overlap: Annotated[float, Metric] + + +@evaluator +def contains_keywords(ctx: EvalContext, keywords: list[str], min_recall: float = 0.0) -> ContainsKeywordsResult: + """Check that the output contains expected keywords (case-insensitive).""" + output_lower = ctx.output.lower() + found = sum(1 for kw in keywords if kw.lower() in output_lower) + total = len(keywords) + recall = found / total if total else 1.0 + return ContainsKeywordsResult( + keyword_recall=recall, + all_keywords_present=recall >= min_recall if min_recall > 0 else found == total, + ) + + +@evaluator +def contains_expected(ctx: EvalContext, case_sensitive: bool = False) -> bool: + """Check that the output contains expected_output as a substring.""" + if ctx.expected_output is None: + return True + if case_sensitive: + return ctx.expected_output in ctx.output + return ctx.expected_output.lower() in ctx.output.lower() + + +@evaluator +def does_not_contain( + ctx: EvalContext, forbidden: list[str], case_sensitive: bool = False +) -> DoesNotContainResult: + """Check that the output does not contain forbidden words.""" + output = ctx.output if 
case_sensitive else ctx.output.lower() + found = [w for w in forbidden if (w if case_sensitive else w.lower()) in output] + return DoesNotContainResult(no_forbidden_words=len(found) == 0) + + +@evaluator +def not_empty(ctx: EvalContext) -> bool: + """Check that the output is not empty or whitespace-only.""" + if ctx.output is None: + return False + if isinstance(ctx.output, str): + return len(ctx.output.strip()) > 0 + return True + + +@evaluator +def max_length(ctx: EvalContext, max_chars: int = 500) -> MaxLengthResult: + """Check that the output doesn't exceed a character limit.""" + length = len(ctx.output) + return MaxLengthResult( + conciseness=min(1.0, max_chars / max(length, 1)), + within_limit=length <= max_chars, + ) + + +@evaluator +def min_length(ctx: EvalContext, min_chars: int = 1) -> bool: + """Check that the output meets a minimum length.""" + return len(ctx.output) >= min_chars + + +@evaluator +def matches_regex(ctx: EvalContext, pattern: str, flags: int = 0) -> bool: + """Check that the output matches a regex pattern.""" + return bool(re.search(pattern, ctx.output, flags)) + + +@evaluator +def json_valid( + ctx: EvalContext, required_keys: list[str] | None = None +) -> JsonValidResult: + """Check that the output is valid JSON, optionally with required keys.""" + if required_keys is None: + required_keys = [] + try: + parsed = json_module.loads(ctx.output) + except (json_module.JSONDecodeError, TypeError): + return JsonValidResult(valid_json=False, has_required_keys=False) + + has_keys = ( + all(k in parsed for k in required_keys) + if required_keys and isinstance(parsed, dict) + else True + ) + return JsonValidResult(valid_json=True, has_required_keys=has_keys) + + +@evaluator +def word_overlap(ctx: EvalContext) -> WordOverlapResult: + """Compute word overlap between output and expected_output (tracking-only).""" + if ctx.expected_output is None: + return WordOverlapResult(overlap=1.0) + expected = str(ctx.expected_output) + expected_words = 
set(expected.lower().split()) + output_words = set(ctx.output.lower().split()) + if not expected_words: + return WordOverlapResult(overlap=1.0) + return WordOverlapResult( + overlap=len(expected_words & output_words) / len(expected_words), + ) diff --git a/protest/evals/hashing.py b/protest/evals/hashing.py new file mode 100644 index 0000000..0f0f5e9 --- /dev/null +++ b/protest/evals/hashing.py @@ -0,0 +1,51 @@ +"""Content hashing for eval cases — detect when cases or scoring change.""" + +from __future__ import annotations + +import dataclasses +import hashlib +import json +from typing import Any + +HASH_LENGTH = 12 + + +def compute_case_hash(inputs: Any, expected_output: Any) -> str: + """Hash the case content (inputs + expected_output).""" + data = {"inputs": _canonical(inputs), "expected": _canonical(expected_output)} + return _hash(data) + + +def compute_eval_hash( + evaluators: list[Any], +) -> str: + """Hash the scoring config (evaluators only).""" + data = { + "evaluators": [_canonical(e) for e in evaluators], + } + return _hash(data) + + +def _hash(data: Any) -> str: + raw = json.dumps(data, sort_keys=True, default=str) + return hashlib.sha256(raw.encode()).hexdigest()[:HASH_LENGTH] + + +def _canonical(obj: Any) -> Any: + """Convert an object to a canonical JSON-serializable form.""" + if obj is None or isinstance(obj, (bool, int, float, str)): + return obj + if isinstance(obj, (list, tuple)): + return [_canonical(item) for item in obj] + if isinstance(obj, dict): + return {str(k): _canonical(v) for k, v in sorted(obj.items())} + # Pydantic models + if hasattr(obj, "model_dump"): + return _canonical(obj.model_dump(mode="json")) + # Dataclasses — iterate without deepcopy to support non-picklable fields + if dataclasses.is_dataclass(obj) and not isinstance(obj, type): + return { + f.name: _canonical(getattr(obj, f.name)) for f in dataclasses.fields(obj) + } + # Fallback + return repr(obj) diff --git a/protest/evals/history.py b/protest/evals/history.py new 
file mode 100644 index 0000000..f7f2544 --- /dev/null +++ b/protest/evals/history.py @@ -0,0 +1,159 @@ +"""EvalHistoryPlugin — persists eval run results as JSONL with model/scores.""" + +from __future__ import annotations + +import uuid +from datetime import datetime, timezone +from typing import TYPE_CHECKING, Any + +from protest.history.collector import collect_env_info, collect_git_info +from protest.history.storage import DEFAULT_HISTORY_DIR, HISTORY_FILE, append_entry +from protest.plugin import PluginBase + +if TYPE_CHECKING: + from pathlib import Path + + from protest.evals.types import EvalCaseResult, EvalSuiteReport, ModelInfo + from protest.plugin import PluginContext + + +class EvalHistoryPlugin(PluginBase): + """Persists eval results to JSONL with model/judge/scores metadata. + + Listens to EVAL_SUITE_END events (emitted by the core runner). + """ + + name = "eval-history" + description = "Eval history tracking" + + def __init__( + self, + *, + history_dir: Path | None = None, + model: ModelInfo | None = None, + judge: dict[str, Any] | None = None, + metadata: dict[str, Any] | None = None, + ) -> None: + self._history_dir = history_dir or DEFAULT_HISTORY_DIR + self._history_file = self._history_dir / HISTORY_FILE + self._model = model + self._judge = judge + self._metadata = dict(metadata) if metadata else {} + self._reports: dict[str, EvalSuiteReport] = {} + + _suite_metadata: dict[str, dict[str, Any]] + + @classmethod + def activate(cls, ctx: PluginContext) -> EvalHistoryPlugin | None: + return None # Wired explicitly by session + + def setup(self, session: Any) -> None: + """Collect per-suite metadata from session.""" + self._suite_metadata = {} + for suite in getattr(session, "suites", []): + if getattr(suite, "kind", "test") == "eval": + self._suite_metadata[suite.name] = getattr(suite, "suite_metadata", {}) + + def on_eval_suite_end(self, report: EvalSuiteReport) -> None: + """Collect suite reports as they arrive.""" + 
self._reports[report.suite_name] = report + + def on_session_end(self, _result: Any) -> None: + """Write all collected reports to history.""" + if not self._reports: + return + entry = _build_entry( + self._reports, + self._model, + self._judge, + self._metadata, + self._suite_metadata, + ) + append_entry(self._history_file, entry) + + def load_entries(self, n: int | None = None) -> list[dict[str, Any]]: + """Load entries from history file.""" + from protest.history.storage import load_history + + return load_history(history_dir=self._history_dir, n=n, evals_only=True) + + +def _build_entry( + reports: dict[str, EvalSuiteReport], + model: ModelInfo | None, + judge: dict[str, Any] | None, + metadata: dict[str, Any] | None = None, + all_suite_metadata: dict[str, dict[str, Any]] | None = None, +) -> dict[str, Any]: + """Build a complete history entry covering all suites in the session.""" + suites_data: dict[str, Any] = {} + all_score_stats: list[Any] = [] + + for suite_name, report in reports.items(): + sm = (all_suite_metadata or {}).get(suite_name, {}) + suite_model = sm.get("model") or (model.name if model else None) + suite_provider = sm.get("provider") or (model.provider if model else None) + suites_data[suite_name] = { + "kind": "eval", + "model": suite_model, + "provider": suite_provider, + "total_cases": report.total_count, + "passed": report.passed_count, + "failed": report.failed_count, + "pass_rate": round(report.pass_rate, 4), + "duration": round(report.duration, 2), + "cases": {c.case_name: _serialize_case(c) for c in report.cases}, + } + all_score_stats.extend(report.all_score_stats()) + + scores_summary = { + s.name: { + "mean": round(s.mean, 4), + "median": round(s.median, 4), + "p5": round(s.p5, 4), + "p95": round(s.p95, 4), + "min": round(s.min, 4), + "max": round(s.max, 4), + "count": s.count, + } + for s in all_score_stats + } + + return { + "run_id": str(uuid.uuid4()), + "timestamp": datetime.now(tz=timezone.utc).isoformat(), + "git": 
collect_git_info(), + "environment": collect_env_info(), + "metadata": dict(metadata) if metadata else {}, + "evals": { + "model": model.name if model else None, + "provider": model.provider if model else None, + "judge": judge, + "scores_summary": scores_summary, + }, + "suites": suites_data, + } + + +def _serialize_case(case: EvalCaseResult) -> dict[str, Any]: + entry: dict[str, Any] = { + "passed": case.passed, + "duration": round(case.duration, 3), + "scores": {s.name: s.value for s in case.scores if s.is_metric}, + "case_hash": case.case_hash, + "eval_hash": case.eval_hash, + } + labels = {s.name: s.value for s in case.scores if isinstance(s.value, str)} + if labels: + entry["labels"] = labels + assertions = {s.name: s.value for s in case.scores if isinstance(s.value, bool)} + if assertions: + entry["assertions"] = assertions + return entry + + +def load_previous_run(history_dir: Any = None) -> dict[str, Any] | None: + """Load the most recent eval run from history.""" + from protest.history.storage import load_previous_run as _load + + return _load(history_dir=history_dir, evals_only=True) diff --git a/protest/evals/results_writer.py b/protest/evals/results_writer.py new file mode 100644 index 0000000..0054e25 --- /dev/null +++ b/protest/evals/results_writer.py @@ -0,0 +1,155 @@ +"""EvalResultsWriter — writes per-case eval results as markdown files. + +Listens to TEST_PASS/FAIL events, filters for eval cases, and writes +a markdown file for each case to .protest/results/_/. 
+""" + +from __future__ import annotations + +import re +from datetime import datetime, timezone +from pathlib import Path +from typing import TYPE_CHECKING, Any + +from protest.plugin import PluginBase + +if TYPE_CHECKING: + from protest.entities.events import TestResult + from protest.evals.types import EvalCaseResult, EvalScore + from protest.plugin import PluginContext + +DEFAULT_RESULTS_DIR = Path(".protest") / "results" + + +class EvalResultsWriter(PluginBase): + """Writes per-case eval result files as markdown.""" + + name = "eval-results-writer" + description = "Write eval case result files" + + def __init__(self, history_dir: Path | None = None) -> None: + self._results_base = ( + (history_dir / "results") if history_dir else DEFAULT_RESULTS_DIR + ) + self._run_dirs: dict[str, Path] = {} + + @classmethod + def activate(cls, ctx: PluginContext) -> EvalResultsWriter | None: + return None # Wired explicitly by session + + def on_test_pass(self, result: TestResult) -> None: + self._maybe_write(result, passed=True) + + def on_test_fail(self, result: TestResult) -> None: + self._maybe_write(result, passed=False) + + def _maybe_write(self, result: TestResult, *, passed: bool) -> None: + if not result.is_eval or result.eval_payload is None: + return + suite_name = result.suite_path.root_name if result.suite_path else "evals" + case_result = _build_case_result(result, passed) + self._write_case_file(case_result, suite_name) + + def _write_case_file(self, case_result: EvalCaseResult, suite_name: str) -> None: + if suite_name not in self._run_dirs: + self._run_dirs[suite_name] = _make_run_dir(suite_name, self._results_base) + _write_case_file(case_result, self._run_dirs[suite_name]) + + def on_eval_suite_end(self, report: Any) -> None: + """Print results dir path for the suite.""" + from protest.evals.types import EvalSuiteReport + + if not isinstance(report, EvalSuiteReport): + return + run_dir = self._run_dirs.get(report.suite_name) + if run_dir: + print(f" 
Results: {run_dir}") + + +def _build_case_result(result: TestResult, passed: bool) -> EvalCaseResult: + """Build EvalCaseResult from a TestResult with eval_payload.""" + from protest.evals.types import EvalCaseResult, EvalScore + + payload = result.eval_payload + assert payload is not None + return EvalCaseResult( + case_name=payload.case_name or "", + node_id=result.node_id, + scores=tuple( + EvalScore( + name=name, + value=entry.value, + ) + for name, entry in payload.scores.items() + ), + duration=payload.task_duration, + passed=passed, + inputs=payload.inputs, + output=payload.output, + expected_output=payload.expected_output, + case_hash=payload.case_hash, + eval_hash=payload.eval_hash, + ) + + +# --------------------------------------------------------------------------- +# File writing helpers +# --------------------------------------------------------------------------- + + +def _make_run_dir(suite_name: str, base_dir: Path | None = None) -> Path: + """Create and return the timestamped directory for this run.""" + base = base_dir or DEFAULT_RESULTS_DIR + ts = datetime.now(tz=timezone.utc).strftime("%Y%m%d_%H%M%S") + safe_suite = re.sub(r"[^\w\-]", "_", suite_name) + run_dir = base / f"{safe_suite}_{ts}" + run_dir.mkdir(parents=True, exist_ok=True) + return run_dir + + +def _write_case_file(case: EvalCaseResult, run_dir: Path) -> None: + """Write a markdown file for a single eval case.""" + safe_name = re.sub(r"[^\w\-]", "_", case.case_name) + path = run_dir / f"{safe_name}.md" + path.write_text(_render_case(case), encoding="utf-8") + + +def _render_case(case: EvalCaseResult) -> str: + status = "PASS ✓" if case.passed else "FAIL ✗" + duration = ( + f"{case.duration * 1000:.0f}ms" + if case.duration < 1 + else f"{case.duration:.2f}s" + ) + lines: list[str] = [ + f"# {case.case_name} — {status} ({duration})", + "", + ] + + lines += ["## Input", "", _format_value(case.inputs), ""] + lines += ["## Output", "", _format_value(case.output), ""] + lines += ["## 
Expected", "", _format_value(case.expected_output), ""]
+
+    if case.scores:
+        lines += ["## Scores", ""]
+        for score in case.scores:
+            lines.append(_format_score(score))
+        lines.append("")
+
+    return "\n".join(lines)
+
+
+def _format_score(score: EvalScore) -> str:
+    if score.is_metric:
+        icon = "·"
+    else:
+        icon = "✓" if score.passed else "✗"
+    return f"- **{score.name}**: {score.value} {icon}"
+
+
+def _format_value(value: Any) -> str:
+    if value is None:
+        return "_none_"
+    if isinstance(value, str):
+        return value if value.strip() else "_empty string_"
+    return f"```\n{value!r}\n```"
diff --git a/protest/evals/session.py b/protest/evals/session.py
new file mode 100644
index 0000000..82bea35
--- /dev/null
+++ b/protest/evals/session.py
@@ -0,0 +1,44 @@
+"""EvalSession — a session dedicated to evals."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any
+
+from protest.core.session import ProTestSession
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+    from protest.evals.types import JudgeInfo, ModelInfo
+
+
+class EvalSession(ProTestSession):
+    """Session dedicated to evals.
+ + Usage:: + + session = EvalSession(model=ModelInfo(name="qwen-2.5")) + + @session.eval(evaluators=[contains_facts]) + async def chatbot(case: Annotated[dict, From(cases)]) -> str: + return await ask(case["q"]) + """ + + def __init__( + self, + *, + model: ModelInfo | None = None, + judge: JudgeInfo | None = None, + concurrency: int = 1, + history: bool = True, + history_dir: Path | None = None, + metadata: dict[str, Any] | None = None, + ) -> None: + super().__init__( + concurrency=concurrency, + history=history, + history_dir=history_dir, + metadata=metadata, + ) + self._eval_model = model + self._eval_judge = judge diff --git a/protest/evals/types.py b/protest/evals/types.py new file mode 100644 index 0000000..24082f1 --- /dev/null +++ b/protest/evals/types.py @@ -0,0 +1,168 @@ +"""Types for eval results, scores, and run context.""" + +from __future__ import annotations + +import statistics +from dataclasses import dataclass, field +from typing import Any + + +@dataclass(frozen=True, slots=True) +class ModelInfo: + """Metadata about the model being evaluated.""" + + name: str + provider: str | None = None + temperature: float | None = None + extra: dict[str, Any] = field(default_factory=dict) + + @classmethod + def from_agent(cls, agent: Any) -> ModelInfo: + """Extract model info from a pydantic-ai Agent (duck-typed).""" + model = getattr(agent, "model", None) + if model is None: + msg = "Agent has no model configured" + raise ValueError(msg) + if isinstance(model, str): + return cls(name=model) + model_name = getattr(model, "model_name", None) + if callable(model_name): + return cls(name=str(model_name())) + return cls(name=str(getattr(model, "name", None) or model)) + + +@dataclass(frozen=True, slots=True) +class JudgeInfo: + """Metadata about the LLM judge used for evaluation.""" + + name: str + provider: str | None = None + evaluators: tuple[str, ...] 
= () + extra: dict[str, Any] = field(default_factory=dict) + + +@dataclass(frozen=True, slots=True) +class EvalScore: + """A single named value from an evaluator result. + + Values are categorized by type: + - bool → verdict (pass/fail) + - float → metric (aggregated in stats) + - str → reason (displayed on failure) + """ + + name: str + value: float | bool | str + + @property + def is_verdict(self) -> bool: + return isinstance(self.value, bool) + + @property + def is_metric(self) -> bool: + return isinstance(self.value, (int, float)) and not isinstance(self.value, bool) + + @property + def is_reason(self) -> bool: + return isinstance(self.value, str) + + @property + def passed(self) -> bool: + if isinstance(self.value, bool): + return self.value + return True # metrics and reasons always "pass" + + +@dataclass(frozen=True, slots=True) +class EvalCaseResult: + """Complete result of evaluating a single case.""" + + case_name: str + node_id: str + scores: tuple[EvalScore, ...] + duration: float + passed: bool + inputs: Any = None + output: Any = None + expected_output: Any = None + case_hash: str = "" + eval_hash: str = "" + + @property + def numeric_scores(self) -> dict[str, float]: + return {s.name: float(s.value) for s in self.scores if s.is_metric} + + @property + def failed_scores(self) -> tuple[EvalScore, ...]: + return tuple(s for s in self.scores if not s.passed) + + +@dataclass(frozen=True, slots=True) +class ScoreStats: + """Aggregated statistics for a named score across cases.""" + + name: str + mean: float + median: float + p5: float + p95: float + min: float + max: float + count: int + + @classmethod + def from_values(cls, name: str, values: list[float]) -> ScoreStats: + if not values: + return cls(name=name, mean=0, median=0, p5=0, p95=0, min=0, max=0, count=0) + sv = sorted(values) + n = len(sv) + return cls( + name=name, + mean=statistics.mean(sv), + median=statistics.median(sv), + p5=sv[max(0, int(n * 0.05))], + p95=sv[min(n - 1, int(n * 0.95))], + 
min=sv[0], + max=sv[-1], + count=n, + ) + + +@dataclass(frozen=True, slots=True) +class EvalSuiteReport: + """Aggregated report for a suite of eval cases.""" + + suite_name: str + cases: tuple[EvalCaseResult, ...] + duration: float + + @property + def passed_count(self) -> int: + return sum(1 for c in self.cases if c.passed) + + @property + def failed_count(self) -> int: + return sum(1 for c in self.cases if not c.passed) + + @property + def total_count(self) -> int: + return len(self.cases) + + @property + def pass_rate(self) -> float: + return self.passed_count / self.total_count if self.cases else 0.0 + + def score_names(self) -> set[str]: + return {s.name for c in self.cases for s in c.scores if s.is_metric} + + def score_stats(self, name: str) -> ScoreStats: + values = [ + float(s.value) + for c in self.cases + for s in c.scores + if s.name == name and s.is_metric + ] + return ScoreStats.from_values(name, values) + + def all_score_stats(self) -> list[ScoreStats]: + return [self.score_stats(n) for n in sorted(self.score_names())] diff --git a/protest/evals/wrapper.py b/protest/evals/wrapper.py new file mode 100644 index 0000000..c9087b6 --- /dev/null +++ b/protest/evals/wrapper.py @@ -0,0 +1,176 @@ +"""Eval wrapper — turns a function into a scored eval test. + +The wrapper intercepts the return value, runs evaluators, and returns +an EvalPayload. The rest of the pipeline (executor, outcome builder, +reporters) handles it like any eval test. 
+""" + +from __future__ import annotations + +import asyncio +import functools +import time +from typing import Any + +from protest.entities.events import EvalPayload, EvalScoreEntry +from protest.evals.evaluator import EvalContext, extract_scores_from_result +from protest.evals.types import EvalScore + + +def make_eval_wrapper( + func: Any, + evaluators: list[Any], + expected_key: str, +) -> Any: + """Wrap a function to run evaluators on its return value.""" + + @functools.wraps(func) + async def eval_wrapper(**kwargs: Any) -> EvalPayload: + expected = _extract_expected(kwargs, expected_key) + case_name = _extract_case_name(kwargs, func.__name__) + inputs = _extract_inputs(kwargs) + metadata = _extract_metadata(kwargs) + + start = time.perf_counter() + if asyncio.iscoroutinefunction(func): + output = await func(**kwargs) + else: + output = func(**kwargs) + task_duration = time.perf_counter() - start + + all_evaluators = list(evaluators) + per_case = _extract_per_case_evaluators(kwargs) + all_evaluators.extend(per_case) + + scores = await run_evaluators( + all_evaluators, + case_name, + inputs, + output, + expected, + metadata, + task_duration, + ) + + from protest.evals.hashing import compute_case_hash, compute_eval_hash + + return EvalPayload( + case_name=case_name, + passed=all(s.passed for s in scores), + task_duration=task_duration, + inputs=inputs, + output=output, + expected_output=expected, + scores={ + s.name: EvalScoreEntry( + value=s.value, + passed=s.passed, + ) + for s in scores + }, + case_hash=compute_case_hash(inputs, expected), + eval_hash=compute_eval_hash(all_evaluators), + ) + + return eval_wrapper + + +# --------------------------------------------------------------------------- +# Extract helpers — pull data from case_kwargs (dict or dataclass) +# --------------------------------------------------------------------------- + + +def _get(obj: Any, key: str, default: Any = None) -> Any: + """Get a value from a dict or dataclass by key/attr 
name.""" + if isinstance(obj, dict): + return obj.get(key, default) + return getattr(obj, key, default) + + +def _is_case_data(v: Any) -> bool: + """Check if a value looks like case data (dict or has 'expected'/'q'/'inputs').""" + if isinstance(v, dict): + return True + return hasattr(v, "expected") or hasattr(v, "q") or hasattr(v, "inputs") + + +def _extract_expected(kwargs: dict[str, Any], key: str) -> Any: + for v in kwargs.values(): + if _is_case_data(v): + val = _get(v, key) + if val is not None: + return val + return None + + +def _extract_case_name(kwargs: dict[str, Any], fallback: str) -> str: + for v in kwargs.values(): + if _is_case_data(v): + name = _get(v, "name") + if name: + return name + return fallback + + +def _extract_inputs(kwargs: dict[str, Any]) -> Any: + for v in kwargs.values(): + if _is_case_data(v): + return _get(v, "inputs") or _get(v, "q") or _get(v, "input") + return None + + +def _extract_metadata(kwargs: dict[str, Any]) -> Any: + for v in kwargs.values(): + if _is_case_data(v): + val = _get(v, "metadata") + if val is not None: + return val + return None + + +def _extract_per_case_evaluators(kwargs: dict[str, Any]) -> list[Any]: + for v in kwargs.values(): + if _is_case_data(v): + evs = _get(v, "evaluators") + if evs: + return list(evs) + return [] + + +# --------------------------------------------------------------------------- +# Evaluator execution +# --------------------------------------------------------------------------- + + +async def run_evaluators( + evaluators: list[Any], + case_name: str, + inputs: Any, + output: Any, + expected_output: Any, + metadata: Any, + duration: float, +) -> list[EvalScore]: + """Run evaluators and convert results to EvalScores.""" + ctx = EvalContext( + name=case_name, + inputs=inputs, + output=output, + expected_output=expected_output, + metadata=metadata, + duration=duration, + ) + + scores: list[EvalScore] = [] + for ev in evaluators: + evaluator_name = getattr(ev, "__name__", 
type(ev).__name__) + try: + raw = ev(ctx) + result = await raw if asyncio.iscoroutine(raw) else raw + scores.extend(extract_scores_from_result(result, evaluator_name)) + except Exception as exc: + from protest.exceptions import FixtureError + + raise FixtureError(f"evaluator '{evaluator_name}'", exc) from exc + + return scores diff --git a/protest/filters/kind.py b/protest/filters/kind.py new file mode 100644 index 0000000..859e7dd --- /dev/null +++ b/protest/filters/kind.py @@ -0,0 +1,36 @@ +"""KindFilterPlugin — filters tests by suite kind (test/eval).""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +from protest.plugin import PluginBase + +if TYPE_CHECKING: + from protest.entities import TestItem + from protest.plugin import PluginContext + + +class KindFilterPlugin(PluginBase): + """Filters collected tests by suite kind ('test' or 'eval').""" + + name = "kind-filter" + description = "Filter by suite kind" + + def __init__(self, kind: str) -> None: + self._kind = kind + + @classmethod + def activate(cls, ctx: PluginContext) -> KindFilterPlugin | None: + kind = ctx.get("kind_filter") + if kind: + return cls(kind=kind) + return None + + def on_collection_finish(self, items: list[TestItem]) -> list[TestItem]: + return [item for item in items if self._matches(item)] + + def _matches(self, item: TestItem) -> bool: + if item.suite is None: + return self._kind == "test" + return item.suite.kind == self._kind From 4310e577e1234a57589a94a00609d12264e8fab8 Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Fri, 27 Mar 2026 19:00:00 +0100 Subject: [PATCH 03/60] feat(reporters): Rich eval table, multi-model history, console.print - on_eval_suite_end: Rich table for scores, plain text for ASCII - Scores inline in -v, --show-output for inputs/output/expected - --show-logs flag for captured log records - Fixture setup time always displayed - protest history --runs: per-suite breakdown with model - 
protest.console.print(): progress output bypassing capture - Lifecycle messages bypass capture (no re-display on fail) - Output truncated at 20 lines with pointer to full output - Case id in lifecycle messages (chatbot[lookup] not chatbot) --- protest/__init__.py | 1 + protest/cli/history.py | 522 +++++++++++++++++++++++++++++ protest/cli/main.py | 60 ++-- protest/console.py | 70 ++++ protest/reporting/ascii.py | 48 ++- protest/reporting/rich_reporter.py | 202 ++++++++++- 6 files changed, 859 insertions(+), 44 deletions(-) create mode 100644 protest/cli/history.py create mode 100644 protest/console.py diff --git a/protest/__init__.py b/protest/__init__.py index 6590311..9317c9f 100644 --- a/protest/__init__.py +++ b/protest/__init__.py @@ -1,3 +1,4 @@ +from protest import console from protest.api import collect_tests, list_tags, run_session from protest.assertions import ExceptionInfo, RaisesContext, raises, warns from protest.core.session import ProTestSession diff --git a/protest/cli/history.py b/protest/cli/history.py new file mode 100644 index 0000000..f9eb7ac --- /dev/null +++ b/protest/cli/history.py @@ -0,0 +1,522 @@ +"""CLI command: protest history — browse run history.""" + +from __future__ import annotations + +import argparse +import sys +from typing import Any + + +def handle_history_command(argv: list[str]) -> None: + """Entry point for `protest history`.""" + parser = argparse.ArgumentParser( + prog="protest history", description="Browse run history" + ) + parser.add_argument( + "--tail", "-n", type=int, default=10, help="Number of entries (default: 10)" + ) + parser.add_argument("--model", type=str, default=None, help="Filter by model name") + parser.add_argument("--suite", type=str, default=None, help="Filter by suite name") + parser.add_argument("--runs", action="store_true", help="Show run-by-run list") + parser.add_argument( + "--show", + nargs="?", + const=0, + type=int, + default=None, + metavar="N", + help="Detailed panel for Nth most recent 
run (0=latest)", + ) + parser.add_argument( + "--compare", action="store_true", help="Compare 2 most recent runs" + ) + parser.add_argument("--evals", action="store_true", help="Eval runs only") + parser.add_argument("--tests", action="store_true", help="Test runs only") + parser.add_argument( + "--clean-dirty", + action="store_true", + help="Remove runs with uncommitted changes on current commit.", + ) + parser.add_argument( + "--path", type=str, default=None, help="History directory (default: .protest/)" + ) + + args = parser.parse_args(argv) + from pathlib import Path + + from protest.history.storage import clean_dirty, load_history + + history_dir = Path(args.path) if args.path else None + + if args.clean_dirty: + removed = clean_dirty(history_dir=history_dir) + print( + f"Removed {removed} dirty entries." + if removed + else "No dirty entries to clean." + ) + sys.exit(0) + + entries = load_history( + history_dir=history_dir, + model=args.model, + suite=args.suite, + evals_only=args.evals, + tests_only=args.tests, + ) + if not entries: + print("No history found.") + sys.exit(0) + + out = _get_output() + if args.compare: + if len(entries) < 2: + print("Need at least 2 runs to compare.") + sys.exit(1) + out.compare(entries[-1], entries[-2]) + elif args.show is not None: + idx = args.show + if idx >= len(entries): + print(f"Only {len(entries)} entries available.") + sys.exit(1) + out.detail(entries[-(idx + 1)]) + elif args.runs: + out.runs(entries[-args.tail :]) + else: + out.stats(entries) + + +# --------------------------------------------------------------------------- +# Output abstraction — Rich if available, plain text fallback +# --------------------------------------------------------------------------- + + +class _Output: + """Base output — plain text.""" + + def stats(self, entries: list[dict[str, Any]]) -> None: + suites = _aggregate_suites(entries) + if not suites: + print("No suite data found.") + return + print(f"\n {'Suite':<22} {'Kind':<6} 
{'Runs':>4} {'Pass rate':<16} {'Flaky'}") + for name in sorted(suites): + s = suites[name] + rate_str = _format_rate(s["pass_rates"]) + flaky_n = len(s["flaky"]) + print( + f" {name:<22} {s['kind']:<6} {s['n_runs']:>4} {rate_str:<16} {flaky_n or ''}" + ) + print() + + def runs(self, entries: list[dict[str, Any]]) -> None: + for i, e in enumerate(entries): + p, t, r = _entry_stats(e) + git = (e.get("git") or {}).get("commit_short", "?") + ts = e.get("timestamp", "?")[:16] + print(f"\n #{len(entries) - i:<3} {ts} {p}/{t} ({r * 100:.0f}%) {git}") + for sn, sd in e.get("suites", {}).items(): + if not isinstance(sd, dict): + continue + sp = sd.get("passed", 0) + st = sd.get("total_cases", 0) + sr = sp / st * 100 if st else 0 + model = sd.get("model") or "-" + print(f" {sn:<20} {sp}/{st} ({sr:.0f}%) {model}") + print() + + def detail(self, entry: dict[str, Any]) -> None: + kind = "EVAL" if entry.get("evals") else "TEST" + git = entry.get("git") or {} + ts = entry.get("timestamp", "?")[:19] + print( + f"\n {kind} run {ts} {git.get('commit_short', '?')} @ {git.get('branch', '?')}" + ) + for sn, sd in entry.get("suites", {}).items(): + if not isinstance(sd, dict): + continue + suite_model = sd.get("model") + model_str = f" [{suite_model}]" if suite_model else "" + print( + f"\n Suite: {sn} {sd.get('passed', 0)}/{sd.get('total_cases', 0)}{model_str}" + ) + for cn, cd in sd.get("cases", {}).items(): + if not isinstance(cd, dict): + continue + m = "+" if cd.get("passed") else "-" + print(f" {m} {cn} ({_fmt_dur(cd.get('duration', 0))})") + print() + + def compare(self, current: dict[str, Any], previous: dict[str, Any]) -> None: + cm = _get_display_model(current) + pm = _get_display_model(previous) + _, _, cr = _entry_stats(current) + _, _, pr = _entry_stats(previous) + if cm == pm: + print(f"\n Model: {cm}") + else: + print(f"\n Model: {pm} → {cm}") + print(f" Pass rate: {pr * 100:.0f}% → {cr * 100:.0f}%") + changes = _classify_changes(_all_cases(current), _all_cases(previous)) 
+ _print_changes(changes) + + +class _RichOutput(_Output): + """Rich output with colors, tables, panels.""" + + def __init__(self) -> None: + from rich.console import Console + + self.console = Console(highlight=False) + + def stats(self, entries: list[dict[str, Any]]) -> None: + from rich.table import Table + + suites = _aggregate_suites(entries) + if not suites: + self.console.print("No suite data found.") + return + table = Table(show_header=True, header_style="bold", box=None, pad_edge=False) + table.add_column("Suite", min_width=12, no_wrap=True) + table.add_column("Kind", width=5) + table.add_column("Runs", justify="right", width=4) + table.add_column("Pass rate", min_width=14, no_wrap=True) + table.add_column("Scores", no_wrap=True) + table.add_column("Flaky", width=5) + + for name in sorted(suites): + s = suites[name] + kind = s["kind"] + kind_color = "cyan" if kind == "eval" else "blue" + rate_str = _rich_rate(s["pass_rates"]) + score_arrows = _rich_score_arrows(s.get("score_values", {})) + flaky_n = len(s["flaky"]) + flaky_str = f"[yellow]{flaky_n}[/]" if flaky_n else "" + table.add_row( + name, + f"[{kind_color}]{kind}[/]", + str(s["n_runs"]), + rate_str, + score_arrows, + flaky_str, + ) + + self.console.print() + self.console.print(table) + self.console.print() + + def runs(self, entries: list[dict[str, Any]]) -> None: + self.console.print() + for i, e in enumerate(entries): + p, t, r = _entry_stats(e) + git = (e.get("git") or {}).get("commit_short", "?") + ts = e.get("timestamp", "?")[:16] + rate_color = "green" if r >= 0.8 else "yellow" if r >= 0.5 else "red" + self.console.print( + f" [dim]#{len(entries) - i:<3}[/] {ts} " + f"[{rate_color}]{p}/{t} ({r * 100:.0f}%)[/] [dim]{git}[/]" + ) + for sn, sd in e.get("suites", {}).items(): + if not isinstance(sd, dict): + continue + sp = sd.get("passed", 0) + st = sd.get("total_cases", 0) + sr = sp / st * 100 if st else 0 + sc = "green" if sr >= 80 else "yellow" if sr >= 50 else "red" + model = sd.get("model") 
or "-" + self.console.print( + f" {sn:<20} [{sc}]{sp}/{st} ({sr:.0f}%)[/] [cyan]{model}[/]" + ) + self.console.print() + + def detail(self, entry: dict[str, Any]) -> None: + from rich.panel import Panel + from rich.text import Text + + kind = "EVAL" if entry.get("evals") else "TEST" + git = entry.get("git") or {} + ts = entry.get("timestamp", "?")[:19] + evals_info = entry.get("evals") or {} + + lines = Text() + lines.append(f"{kind} run", style="bold") + lines.append(f" {ts} ", style="dim") + lines.append( + f"{git.get('commit_short', '?')} @ {git.get('branch', '?')}\n", style="dim" + ) + + # Scores summary + for sn, stats in evals_info.get("scores_summary", {}).items(): + mean = stats.get("mean", 0) + color = "green" if mean >= 0.8 else "yellow" if mean >= 0.5 else "red" + lines.append(f" {sn}: ", style="dim") + lines.append(f"mean={mean:.2f}", style=color) + lines.append( + f" p50={stats.get('median', 0):.2f} p95={stats.get('p95', 0):.2f}\n", + style="dim", + ) + + for sn, sd in entry.get("suites", {}).items(): + if not isinstance(sd, dict): + continue + p, t = sd.get("passed", 0), sd.get("total_cases", 0) + lines.append("\nSuite: ", style="bold") + lines.append(sn) + pc = "green" if p == t else "yellow" if p >= t * 0.5 else "red" + lines.append(f" {p}/{t}", style=pc) + suite_model = sd.get("model") + if suite_model: + lines.append(f" [{suite_model}]", style="cyan") + lines.append(f" {_fmt_dur(sd.get('duration', 0))}\n", style="dim") + for cn, cd in sd.get("cases", {}).items(): + if not isinstance(cd, dict): + continue + if cd.get("passed"): + lines.append(" + ", style="green") + else: + lines.append(" - ", style="red") + lines.append(cn) + lines.append(f" ({_fmt_dur(cd.get('duration', 0))})\n", style="dim") + + self.console.print() + self.console.print( + Panel(lines, title="[bold]Run Detail[/]", border_style="cyan") + ) + + def compare(self, current: dict[str, Any], previous: dict[str, Any]) -> None: + from rich.panel import Panel + from rich.text import Text 
+ + cm = _get_display_model(current) + pm = _get_display_model(previous) + _, _, cr = _entry_stats(current) + _, _, pr = _entry_stats(previous) + delta = cr - pr + + lines = Text() + if cm == pm: + lines.append(f"Model: {cm}\n", style="cyan") + else: + lines.append(f"Model: {pm} → {cm}\n", style="cyan") + + lines.append("Pass rate: ") + lines.append(f"{pr * 100:.0f}%", style="dim") + lines.append(" → ") + rc = "green" if delta > 0 else "red" if delta < 0 else "" + lines.append(f"{cr * 100:.0f}%", style=rc) + if abs(delta) >= 0.001: + lines.append(f" ({delta * 100:+.0f}%)", style=rc) + lines.append("\n\n") + + changes = _classify_changes(_all_cases(current), _all_cases(previous)) + labels = [ + ("fixed", "Fixed", "green", "+"), + ("regressed", "Regressions", "red", "-"), + ("modified", "Modified", "yellow", "⟳"), + ("new", "New", "cyan", "*"), + ] + has_any = False + for key, label, color, marker in labels: + items = changes[key] + if items: + has_any = True + lines.append(f"{label} ({len(items)}):\n", style=color) + for n in items: + lines.append(f" {marker} {n}\n") + lines.append("\n") + if not has_any: + lines.append("No changes.\n", style="dim") + + self.console.print() + self.console.print( + Panel(lines, title="[bold]Run Comparison[/]", border_style="cyan") + ) + + +def _get_output() -> _Output: + try: + return _RichOutput() + except ImportError: + return _Output() + + +# --------------------------------------------------------------------------- +# Rich helpers +# --------------------------------------------------------------------------- + + +def _rich_rate(rates: list[float]) -> str: + if len(rates) >= 2: + first, last = rates[0], rates[-1] + delta = last - first + if delta > 0.01: + return f"[dim]{first * 100:.0f}%[/] [green]↗ {last * 100:.0f}%[/]" + if delta < -0.01: + return f"[dim]{first * 100:.0f}%[/] [red]↘ {last * 100:.0f}%[/]" + return f"{last * 100:.0f}%" + if rates: + return f"{rates[0] * 100:.0f}%" + return "-" + + +def 
_rich_score_arrows(score_values: dict[str, list[float]]) -> str: + """Score trend arrows: ↗↘→ per score.""" + parts: list[str] = [] + for _name, values in sorted(score_values.items()): + if len(values) >= 2: + d = values[-1] - values[0] + if d > 0.01: + parts.append("[green]↗[/]") + elif d < -0.01: + parts.append("[red]↘[/]") + else: + parts.append("[dim]→[/]") + return "".join(parts) + + +# --------------------------------------------------------------------------- +# Data helpers +# --------------------------------------------------------------------------- + + +def _format_rate(rates: list[float]) -> str: + if len(rates) >= 2: + first, last = rates[0], rates[-1] + delta = last - first + arrow = "↗" if delta > 0.01 else "↘" if delta < -0.01 else "→" + return f"{first * 100:.0f}% {arrow} {last * 100:.0f}%" + if rates: + return f"{rates[0] * 100:.0f}%" + return "-" + + +def _aggregate_suites(entries: list[dict[str, Any]]) -> dict[str, dict[str, Any]]: + suites: dict[str, dict[str, Any]] = {} + for entry in entries: + for name, data in entry.get("suites", {}).items(): + if not isinstance(data, dict): + continue + if name not in suites: + suites[name] = { + "kind": data.get("kind", "test"), + "n_runs": 0, + "pass_rates": [], + "flaky": {}, + "cases_seen": {}, + "score_values": {}, + } + s = suites[name] + s["n_runs"] += 1 + total = data.get("total_cases", 0) + passed = data.get("passed", 0) + if total: + s["pass_rates"].append(passed / total) + _track_cases(s, data.get("cases", {})) + + for s in suites.values(): + s["flaky"] = { + cn: cs["fails"] + for cn, cs in s["cases_seen"].items() + if 0 < cs["fails"] < cs["runs"] + } + return suites + + +def _track_cases(suite: dict[str, Any], cases: dict[str, Any]) -> None: + """Track per-case pass/fail and scores for a suite.""" + for cn, cd in cases.items(): + if not isinstance(cd, dict): + continue + if cn not in suite["cases_seen"]: + suite["cases_seen"][cn] = {"runs": 0, "fails": 0} + suite["cases_seen"][cn]["runs"] += 1 
+ if not cd.get("passed", True): + suite["cases_seen"][cn]["fails"] += 1 + for sn, sv in cd.get("scores", {}).items(): + if isinstance(sv, (int, float)): + if sn not in suite["score_values"]: + suite["score_values"][sn] = [] + suite["score_values"][sn].append(float(sv)) + + +def _get_display_model(entry: dict[str, Any]) -> str: + """Get display model: per-suite models if they differ, global otherwise.""" + suite_models = { + sd.get("model") + for sd in entry.get("suites", {}).values() + if isinstance(sd, dict) and sd.get("model") + } + if len(suite_models) > 1: + return ", ".join(sorted(suite_models)) + if suite_models: + return next(iter(suite_models)) + return (entry.get("evals") or {}).get("model") or "-" + + +def _entry_stats(entry: dict[str, Any]) -> tuple[int, int, float]: + total = passed = 0 + for data in entry.get("suites", {}).values(): + if isinstance(data, dict): + total += data.get("total_cases", 0) + passed += data.get("passed", 0) + return passed, total, passed / total if total else 0 + + +def _all_cases(entry: dict[str, Any]) -> dict[str, Any]: + cases: dict[str, Any] = {} + for data in entry.get("suites", {}).values(): + if isinstance(data, dict): + cases.update(data.get("cases", {})) + return cases + + +def _classify_changes( + curr_cases: dict[str, Any], + prev_cases: dict[str, Any], +) -> dict[str, list[str]]: + result: dict[str, list[str]] = { + "fixed": [], + "regressed": [], + "modified": [], + "new": [], + } + for name, curr in curr_cases.items(): + prev = prev_cases.get(name) + if prev is None: + result["new"].append(name) + elif curr.get("case_hash") and curr["case_hash"] != prev.get("case_hash"): + result["modified"].append(f"{name} (case modified)") + elif curr.get("eval_hash") and curr["eval_hash"] != prev.get("eval_hash"): + result["modified"].append(f"{name} (scoring modified)") + elif curr.get("passed") and not prev.get("passed"): + result["fixed"].append(name) + elif not curr.get("passed") and prev.get("passed"): + 
result["regressed"].append(name) + return result + + +def _print_changes(changes: dict[str, list[str]]) -> None: + labels = { + "fixed": ("Fixed", "+"), + "regressed": ("Regressions", "-"), + "modified": ("Modified", "⟳"), + "new": ("New", "*"), + } + has_any = False + for key, (label, marker) in labels.items(): + if changes[key]: + has_any = True + print(f"\n {label} ({len(changes[key])}):") + for n in changes[key]: + print(f" {marker} {n}") + if not has_any: + print(" No changes.") + print() + + +def _fmt_dur(seconds: float) -> str: + if seconds < 1: + return f"{seconds * 1000:.0f}ms" + if seconds < 60: + return f"{seconds:.1f}s" + return f"{int(seconds // 60)}m{seconds % 60:.0f}s" diff --git a/protest/cli/main.py b/protest/cli/main.py index a913e7f..0ee6f2a 100644 --- a/protest/cli/main.py +++ b/protest/cli/main.py @@ -2,7 +2,7 @@ import argparse import sys -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any if TYPE_CHECKING: from protest.core.session import ProTestSession @@ -103,19 +103,21 @@ def main() -> None: _print_help() return - if command == "tags": - _handle_tags_command() + commands: dict[str, Any] = { + "tags": _handle_tags_command, + "run": lambda: _handle_run_command(kind_filter="test"), + "eval": lambda: _handle_run_command(kind_filter="eval"), + "history": _handle_history_command, + "live": _handle_live_command, + } + + handler = commands.get(command) + if handler: + handler() return - if command == "run": - _handle_run_command() - return - - if command == "live": - _handle_live_command() - return - - print(f"Error: Unknown command '{command}'. Use 'run', 'tags', or 'live'.") + valid = ", ".join(f"'{c}'" for c in commands) + print(f"Error: Unknown command '{command}'. 
Use {valid}.") sys.exit(1) @@ -143,9 +145,11 @@ def _print_help() -> None: """Print main help.""" print("ProTest - Async-first Python test framework\n") print("Commands:") - print(" run Run tests") - print(" live Start live reporter server") - print(" tags Tag inspection commands") + print(" run Run tests") + print(" eval Run evaluations") + print(" history Browse run history") + print(" live Start live reporter server") + print(" tags Tag inspection commands") print(HELP_EPILOG) @@ -228,8 +232,15 @@ def _create_run_parser() -> argparse.ArgumentParser: return parser -def _handle_run_command() -> None: - """Handle 'protest run' subcommand with two-phase parsing.""" +def _handle_history_command() -> None: + """Handle 'protest history' subcommand.""" + from protest.cli.history import handle_history_command + + handle_history_command(sys.argv[2:]) + + +def _handle_run_command(kind_filter: str | None = None) -> None: + """Handle 'protest run' / 'protest eval' with two-phase parsing.""" from protest.loader import LoadError, load_session, parse_target argv = sys.argv[2:] @@ -275,13 +286,14 @@ def _handle_run_command() -> None: from protest.reporting.verbosity import Verbosity effective_verbosity = Verbosity.QUIET if args.quiet else args.verbosity - ctx = PluginContext( - args={ - **vars(args), - "target_suite": suite_filter, - "verbosity": effective_verbosity, - } - ) + ctx_args: dict[str, Any] = { + **vars(args), + "target_suite": suite_filter, + "verbosity": effective_verbosity, + } + if kind_filter: + ctx_args["kind_filter"] = kind_filter + ctx = PluginContext(args=ctx_args) # Phase 6: Run tests (api.run_session handles plugin activation) run_tests(session, ctx, collect_only=args.collect_only) diff --git a/protest/console.py b/protest/console.py new file mode 100644 index 0000000..9270c16 --- /dev/null +++ b/protest/console.py @@ -0,0 +1,70 @@ +"""protest.console — progress output that bypasses test capture. 
+ +Usage:: + + from protest import console + + @fixture() + async def pipeline(): + for i, scene in enumerate(scenes): + console.print(f"[bold]pipeline:[/] importing {scene.name} ({i+1}/{len(scenes)})") + await import_scene(scene) + + # Raw mode — no markup processing + console.print("debug: raw bytes here", raw=True) + +Messages go through the event bus → reporters display them inline. +If no event bus is available (outside a protest session), falls back to stderr. +""" + +from __future__ import annotations + +import re +import sys + + +def print(msg: str, *, raw: bool = False) -> None: + """Print a message that bypasses test capture. + + Goes through the event bus so reporters display it at the right place. + Supports Rich markup (stripped for ASCII reporter). + + Args: + msg: The message to print. Supports Rich markup unless raw=True. + raw: If True, no markup processing — message passed as-is. + """ + from protest.execution.capture import get_event_bus + + bus = get_event_bus() + if bus is None: + _fallback_print(msg, raw) + return + + from protest.events.types import Event + + # Call handlers directly (sync, bypasses async emit). + # This ensures messages appear immediately, not after the test. + for handler_entry in bus._handlers.get(Event.USER_PRINT, []): + try: + handler_entry.func((msg, raw)) + except Exception: + pass + + +def _fallback_print(msg: str, raw: bool) -> None: + """Fallback when no event bus — write to real stderr (bypassing capture).""" + text = msg if raw else strip_markup(msg) + # sys.stderr may be wrapped by TaskAwareStream — get the original + stream = getattr(sys.stderr, "_original", sys.stderr) + stream.write(text + "\n") + stream.flush() + + +def strip_markup(msg: str) -> str: + """Strip Rich markup tags from a string. + + Handles escaped brackets (``\\[text]`` → ``[text]``). 
+ """ + msg = msg.replace("\\[", "\x00") + msg = re.sub(r"\[/?[^\]]*\]", "", msg) + return msg.replace("\x00", "[") diff --git a/protest/reporting/ascii.py b/protest/reporting/ascii.py index 9ff7211..a52c509 100644 --- a/protest/reporting/ascii.py +++ b/protest/reporting/ascii.py @@ -1,5 +1,6 @@ import traceback from pathlib import Path +from typing import Any from typing_extensions import Self @@ -123,7 +124,7 @@ def on_fixture_setup_start(self, info: FixtureInfo) -> None: print(f" -> fixture '{info.name}' setup... ({info.scope.value})") def on_fixture_setup_done(self, info: FixtureInfo) -> None: - if self._verbosity >= Verbosity.FIXTURES: + if self._verbosity >= Verbosity.NORMAL: print( f" -> fixture '{info.name}' ready ({_format_duration(info.duration)})" ) @@ -140,11 +141,19 @@ def on_fixture_teardown_done(self, info: FixtureInfo) -> None: def on_test_setup_done(self, info: TestStartInfo) -> None: if self._verbosity >= Verbosity.FIXTURES: - print(f" > {info.name} setup done") + self._print_bypass(f" > {info.name} setup done") def on_test_teardown_start(self, info: TestTeardownInfo) -> None: if self._verbosity >= Verbosity.FIXTURES: - print(f" < {info.name} teardown...") + self._print_bypass(f" < {info.name} teardown...") + + @staticmethod + def _print_bypass(msg: str) -> None: + import sys + + stream = getattr(sys.stdout, "_original", sys.stdout) + stream.write(msg + "\n") + stream.flush() def on_test_retry(self, info: TestRetryInfo) -> None: delay_msg = f", retrying in {info.delay}s" if info.delay > 0 else "" @@ -250,6 +259,39 @@ def _print_failure_detail(self, result: TestResult, *, is_error: bool) -> None: for line in result.output.rstrip().splitlines(): print(f" {line}") + def on_user_print(self, data: Any) -> None: + import sys + + from protest.console import strip_markup + + msg, raw = data + text = msg if raw else strip_markup(msg) + stream = getattr(sys.stdout, "_original", sys.stdout) + stream.write(f" | {text}\n") + stream.flush() + + def 
on_eval_suite_end(self, report: Any) -> None: + from protest.evals.types import EvalSuiteReport + + if not isinstance(report, EvalSuiteReport): + return + stats = report.all_score_stats() + print() + print(f" Eval: {report.suite_name} ({report.total_count} cases)") + if stats: + max_name = max(len(s.name) for s in stats) + print(" " + "─" * 60) + for s in stats: + print( + f" {s.name:<{max_name}} " + f"mean={s.mean:.2f} p50={s.median:.2f} " + f"p5={s.p5:.2f} p95={s.p95:.2f}" + ) + print(" " + "─" * 60) + rate_pct = report.pass_rate * 100 + print(f" Passed: {report.passed_count}/{report.total_count} ({rate_pct:.1f}%)") + print() + def on_session_complete(self, result: SessionResult) -> None: if self._failed_results or self._error_results: self._print_failure_summary() diff --git a/protest/reporting/rich_reporter.py b/protest/reporting/rich_reporter.py index 2931e6b..8f263d9 100644 --- a/protest/reporting/rich_reporter.py +++ b/protest/reporting/rich_reporter.py @@ -1,6 +1,7 @@ import traceback from argparse import ArgumentParser from pathlib import Path +from typing import Any from rich.console import Console # type: ignore[import-not-found] from typing_extensions import Self @@ -24,12 +25,17 @@ from protest.reporting.verbosity import Verbosity +def _short_label(name: str, node_id: str) -> str: + """name + [case_id] from node_id.""" + if "[" in node_id: + suffix = node_id[node_id.index("[") :] + return f"{name}{suffix}" + return name + + def _format_test_name(result: TestResult) -> str: - if "[" in result.node_id: - suffix = result.node_id[result.node_id.index("[") :] - escaped_suffix = suffix.replace("[", "\\[") - return f"{result.name}{escaped_suffix}" - return result.name + label = _short_label(result.name, result.node_id) + return label.replace("[", "\\[") MIN_DURATION_THRESHOLD = 0.001 @@ -43,15 +49,38 @@ def _format_duration(seconds: float) -> str: return f"{seconds:.2f}s" +def _format_eval_scores_inline(result: TestResult) -> str: + """Format eval scores for 
inline display (e.g. ' bg_score=0.8 char_id=1.0').""" + if not result.eval_payload: + return "" + parts = [] + for name, entry in result.eval_payload.scores.items(): + val = entry.value + if isinstance(val, bool): + parts.append(f"{name}={'✓' if val else '✗'}") + elif isinstance(val, float): + parts.append(f"{name}={val:.2f}") + else: + parts.append(f"{name}={val}") + return f" [dim]{' '.join(parts)}[/]" if parts else "" + + class RichReporter(PluginBase): """Rich console reporter with colors.""" name = "rich-reporter" description = "Rich console reporter with colors" - def __init__(self, verbosity: int = 0) -> None: + def __init__( + self, + verbosity: int = 0, + show_logs: str | None = None, + show_output: bool = False, + ) -> None: self.console = Console(highlight=False) self._verbosity = verbosity + self._show_logs = show_logs + self._show_output = show_output self._failed_results: list[TestResult] = [] self._error_results: list[TestResult] = [] @@ -71,16 +100,80 @@ def add_cli_options(cls, parser: ArgumentParser) -> None: action="store_true", help="Disable colors (plain ASCII output)", ) + group.add_argument( + "--show-logs", + dest="show_logs", + nargs="?", + const="INFO", + default=None, + metavar="LEVEL", + help="Show captured log records (default: INFO+)", + ) + group.add_argument( + "--show-output", + dest="show_output", + action="store_true", + help="Show eval inputs/output/expected per case", + ) @classmethod def activate(cls, ctx: PluginContext) -> Self | None: if ctx.get("no_color", False): return None - return cls(verbosity=ctx.get("verbosity", 0)) + return cls( + verbosity=ctx.get("verbosity", 0), + show_logs=ctx.get("show_logs"), + show_output=ctx.get("show_output", False), + ) def _print(self, message: str) -> None: self.console.print(message) + def _print_eval_detail(self, result: TestResult) -> None: + """Print eval inputs/output/expected for -vv verbosity.""" + p = result.eval_payload + if not p: + return + if p.inputs is not None: + inp = 
str(p.inputs)[:200] + self._print(f"[dim] │ inputs: {inp}[/]") + if p.output is not None: + out = str(p.output)[:200] + self._print(f"[dim] │ output: {out}[/]") + if p.expected_output is not None: + exp = str(p.expected_output)[:200] + self._print(f"[dim] │ expected: {exp}[/]") + + def _maybe_show_logs(self, result: TestResult) -> None: + """Show captured log records if --show-logs is active.""" + if not self._show_logs or not result.log_records: + return + import logging + + min_level = getattr(logging, self._show_logs.upper(), logging.INFO) + for record in result.log_records: + if record.levelno >= min_level: + level = record.levelname + color = ( + "red" + if record.levelno >= logging.ERROR + else "yellow" + if record.levelno >= logging.WARNING + else "dim" + ) + self._print( + f"[{color}] LOG [{level}] {record.name}: {record.getMessage()}[/]" + ) + + def _print_bypass(self, message: str) -> None: + """Print bypassing capture (for lifecycle messages emitted during tests).""" + import sys + + from rich.console import Console + + stream = getattr(sys.stdout, "_original", sys.stdout) + Console(file=stream, highlight=False).print(message) + def on_collection_finish(self, items: list[TestItem]) -> list[TestItem]: return items @@ -128,7 +221,7 @@ def on_fixture_setup_start(self, info: FixtureInfo) -> None: self._print(f"[dim] ↳ fixture '{info.name}' setup... 
{scope_str}[/]") def on_fixture_setup_done(self, info: FixtureInfo) -> None: - if self._verbosity >= Verbosity.FIXTURES: + if self._verbosity >= Verbosity.NORMAL: self._print( f"[dim] ↳ fixture '{info.name}' ready ({_format_duration(info.duration)})[/]" ) @@ -145,11 +238,13 @@ def on_fixture_teardown_done(self, info: FixtureInfo) -> None: def on_test_setup_done(self, info: TestStartInfo) -> None: if self._verbosity >= Verbosity.FIXTURES: - self._print(f"[dim] → {info.name} setup done[/]") + label = _short_label(info.name, info.node_id).replace("[", "\\[") + self._print_bypass(f"[dim] → {label} setup done[/]") def on_test_teardown_start(self, info: TestTeardownInfo) -> None: if self._verbosity >= Verbosity.FIXTURES: - self._print(f"[dim] ← {info.name} teardown...[/]") + label = _short_label(info.name, info.node_id).replace("[", "\\[") + self._print_bypass(f"[dim] ← {label} teardown...[/]") def on_test_retry(self, info: TestRetryInfo) -> None: delay_msg = f", retrying in {info.delay}s" if info.delay > 0 else "" @@ -169,7 +264,13 @@ def on_test_pass(self, result: TestResult) -> None: retry_suffix = ( f" [dim]\\[attempt {result.attempt}/{result.max_attempts}][/]" ) - self._print(f" [green]✓[/] {name} [dim]({duration})[/]{retry_suffix}") + scores_str = _format_eval_scores_inline(result) if result.is_eval else "" + self._print( + f" [green]✓[/] {name} [dim]({duration})[/]{scores_str}{retry_suffix}" + ) + if self._show_output and result.is_eval: + self._print_eval_detail(result) + self._maybe_show_logs(result) def on_test_fail(self, result: TestResult) -> None: name = _format_test_name(result) @@ -197,8 +298,17 @@ def on_test_fail(self, result: TestResult) -> None: self._print(f" [red]✗[/] {name}: {result.error}{retry_suffix}") if result.output: - for line in result.output.rstrip().splitlines(): + lines = result.output.rstrip().splitlines() + max_lines = 20 + for line in lines[:max_lines]: self._print(f"[dim] │ {line}[/]") + if len(lines) > max_lines: + self._print( + 
f"[dim] │ ... ({len(lines) - max_lines} more lines in .protest/last_run_stdout)[/]" + ) + if result.is_eval: + self._print_eval_detail(result) # always show on fail + self._maybe_show_logs(result) def on_test_skip(self, result: TestResult) -> None: self._skipped += 1 @@ -249,14 +359,16 @@ def _format_traceback(self, error: Exception) -> str: return "".join(lines) def _print_failure_summary(self) -> None: - if self._failed_results: + non_eval_failures = [r for r in self._failed_results if not r.is_eval] + if non_eval_failures: self._print("\n[bold red]═══ FAILURES ═══[/]") - for result in self._failed_results: + for result in non_eval_failures: self._print_failure_detail(result, is_error=False) - if self._error_results: + non_eval_errors = [r for r in self._error_results if not r.is_eval] + if non_eval_errors: self._print("\n[bold yellow]═══ ERRORS ═══[/]") - for result in self._error_results: + for result in non_eval_errors: self._print_failure_detail(result, is_error=True) def _print_failure_detail(self, result: TestResult, *, is_error: bool) -> None: @@ -281,8 +393,64 @@ def _print_failure_detail(self, result: TestResult, *, is_error: bool) -> None: escaped_line = line.replace("[", "\\[") self._print(f"[dim]{escaped_line}[/]") + def on_user_print(self, data: Any) -> None: + import sys + + from rich.console import Console + + msg, raw = data + # Write to the real stdout, bypassing capture + stream = getattr(sys.stdout, "_original", sys.stdout) + c = Console(file=stream, highlight=False) + if raw: + c.print(msg, markup=False) + else: + c.print(f"[dim] │[/] {msg}") + + def on_eval_suite_end(self, report: Any) -> None: + from rich.table import Table + + from protest.evals.types import EvalSuiteReport + + if not isinstance(report, EvalSuiteReport): + return + stats = report.all_score_stats() + self._print("") + if stats: + table = Table( + title=f"Eval: {report.suite_name} ({report.total_count} cases)", + show_header=True, + header_style="bold cyan", + padding=(0, 1), 
+ ) + table.add_column("Score", style="cyan", no_wrap=True) + table.add_column("mean", justify="right") + table.add_column("p50", justify="right") + table.add_column("p5", justify="right", style="dim") + table.add_column("p95", justify="right", style="dim") + for s in stats: + table.add_row( + s.name, + f"{s.mean:.2f}", + f"{s.median:.2f}", + f"{s.p5:.2f}", + f"{s.p95:.2f}", + ) + self.console.print(table) + else: + self._print( + f" [cyan]Eval: {report.suite_name} ({report.total_count} cases)[/]" + ) + rate_pct = report.pass_rate * 100 + color = "green" if rate_pct >= 100 else "yellow" if rate_pct >= 50 else "red" + self._print( + f" [{color}]Passed: {report.passed_count}/{report.total_count} ({rate_pct:.1f}%)[/]" + ) + def on_session_complete(self, result: SessionResult) -> None: - if self._failed_results or self._error_results: + has_non_eval_failures = any(not r.is_eval for r in self._failed_results) + has_non_eval_errors = any(not r.is_eval for r in self._error_results) + if has_non_eval_failures or has_non_eval_errors: self._print_failure_summary() total = ( From 82f736b304fa3a48bf21b051017d022827dc569f Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Sun, 29 Mar 2026 20:00:00 +0200 Subject: [PATCH 04/60] feat: tests, examples, and documentation - 1063 tests (56 eval-specific) - Yorkshire chatbot example with @session.eval + ForEach - History module: JSONL storage, git info, env info - docs/evals.md: full guide (scoring, evaluators, CLI, history) - docs/core-concepts/console.md: console.print guide --- docs/core-concepts/console.md | 49 ++ docs/evals.md | 356 ++++++++ examples/yorkshire/app/chatbot.py | 93 +++ examples/yorkshire/evals/__init__.py | 0 examples/yorkshire/evals/dataset.py | 122 +++ examples/yorkshire/evals/evaluators.py | 5 + examples/yorkshire/evals/session.py | 29 + examples/yorkshire/session.py | 52 ++ mkdocs.yml | 2 + protest/history/__init__.py | 17 + protest/history/collector.py | 81 ++ 
protest/history/plugin.py | 98 +++ protest/history/storage.py | 135 +++ tests/core/test_collector.py | 6 +- tests/core/test_parametrize.py | 2 +- tests/core/test_skip.py | 2 +- tests/core/test_skipif.py | 2 +- tests/core/test_xfail.py | 2 +- tests/evals/test_e2e.py | 1064 ++++++++++++++++++++++++ tests/evals/test_hashing.py | 72 ++ 20 files changed, 2182 insertions(+), 7 deletions(-) create mode 100644 docs/core-concepts/console.md create mode 100644 docs/evals.md create mode 100644 examples/yorkshire/app/chatbot.py create mode 100644 examples/yorkshire/evals/__init__.py create mode 100644 examples/yorkshire/evals/dataset.py create mode 100644 examples/yorkshire/evals/evaluators.py create mode 100644 examples/yorkshire/evals/session.py create mode 100644 examples/yorkshire/session.py create mode 100644 protest/history/__init__.py create mode 100644 protest/history/collector.py create mode 100644 protest/history/plugin.py create mode 100644 protest/history/storage.py create mode 100644 tests/evals/test_e2e.py create mode 100644 tests/evals/test_hashing.py diff --git a/docs/core-concepts/console.md b/docs/core-concepts/console.md new file mode 100644 index 0000000..b172246 --- /dev/null +++ b/docs/core-concepts/console.md @@ -0,0 +1,49 @@ +# Console Output + +Print progress and debug messages that bypass test capture. + +## The Problem + +`print()` inside tests and fixtures is captured by ProTest. During long-running fixtures (pipeline imports, graph seeding), there's no visible feedback. + +## `console.print` + +```python +from protest import console + +@fixture() +async def pipeline(): + for i, scene in enumerate(scenes): + console.print(f"[cyan]pipeline:[/] importing {scene.name} ({i+1}/{len(scenes)})") + await import_scene(scene) + return driver +``` + +Messages appear inline in the reporter output, between test results. + +## Rich Markup + +`console.print` supports Rich markup. The Rich reporter renders colors; the ASCII reporter strips tags. 
+ +```python +console.print(f"[bold green]done[/] in {duration:.1f}s") +console.print(f"[yellow]warning:[/] slow query ({elapsed:.2f}s)") +``` + +## Raw Mode + +Skip markup processing with `raw=True`: + +```python +console.print("debug: raw bytes here", raw=True) +``` + +The message is passed as-is to both reporters. + +## How It Works + +`console.print` sends a `USER_PRINT` event through the event bus. The reporter receives it and writes to the real stdout (bypassing test capture). This means: + +- Messages appear immediately, not buffered until test end +- Works with `-n 4` (concurrent tests) — the event bus serializes per plugin +- No interference with test capture or `result.output` diff --git a/docs/evals.md b/docs/evals.md new file mode 100644 index 0000000..11895aa --- /dev/null +++ b/docs/evals.md @@ -0,0 +1,356 @@ +# Evals + +Evaluate LLM outputs with scored metrics, thresholds, and historical tracking. + +## What is an Eval? + +A test produces **pass/fail**. An eval produces **scores** — numeric values (0.0–1.0) that measure output quality. Scores are aggregated across cases, tracked over time, and compared between runs. + +ProTest evals use the same infrastructure as tests: fixtures, DI, parallelism, tags. An eval is a test that returns a value, scored by evaluators. 
+ +## Quick Start + +```python +# evals/session.py +from typing import Annotated + +from protest import ForEach, From +from protest.evals import EvalCase, EvalSession, evaluator +from protest.evals.evaluators import contains_keywords + +cases = ForEach([ + EvalCase(inputs="Who is Marie?", expected="Marie, Resistance", name="lookup"), + EvalCase(inputs="What is 2+2?", expected="4", name="math"), +]) + +session = EvalSession() + +@session.eval(evaluators=[contains_keywords(keywords=["Marie"])]) +async def chatbot(case: Annotated[EvalCase, From(cases)]) -> str: + return await my_agent(case.inputs) +``` + +```bash +protest eval evals.session:session +``` + +## How It Works + +`@session.eval()` wraps a function to run evaluators on its return value: + +1. Your function receives case data via `ForEach`/`From` (same as parameterized tests) +2. It returns the output (string, object, anything) +3. ProTest passes the output to evaluators → scores +4. Scores determine pass/fail via thresholds +5. Aggregated stats appear in the terminal + +The rest of the pipeline — fixtures, DI, parallelism, reporters — works identically to tests. + +## EvalSession + +`EvalSession` is a session configured for evals. History is enabled by default. + +```python +from protest.evals import EvalSession, ModelInfo + +session = EvalSession( + model=ModelInfo(name="gpt-4o-mini"), # tracked in history + concurrency=4, # parallel eval cases + metadata={"version": "1.0"}, # stored in history +) +``` + +## EvalCase + +Typed dataclass for eval case data. Provides IDE autocompletion instead of untyped dicts. 
+ +```python +from protest.evals import EvalCase + +cases = ForEach([ + EvalCase(inputs="What is 2+2?", expected="4", name="math"), + EvalCase(inputs="Who is Napoleon?", expected="emperor, France", name="history"), +]) +``` + +| Field | Type | Description | +|-------|------|-------------| +| `inputs` | `Any` | Input to your task function | +| `expected` | `Any` | Expected output (passed to evaluators as `ctx.expected_output`) | +| `name` | `str` | Case identifier (used in test IDs and history) | +| `evaluators` | `list` | Per-case evaluators (added to suite-level ones) | +| `metadata` | `dict` | Arbitrary metadata | + +## Evaluators + +An evaluator is a function decorated with `@evaluator` that receives an `EvalContext` and returns a verdict. + +### Return Types + +Evaluators return `bool` (simple verdict) or a `dataclass` (structured result). The framework reads fields by type: + +| Field Type | Role | +|------------|------| +| `bool` | Verdict — pass/fail (`all(bool_fields)`) | +| `float` | Metric — aggregated in stats (mean/p50/p95) | +| `str` | Reason — displayed on failure, stored in history | + +Returning `float`, `dict`, or any other type raises `TypeError`. 
+ +### Simple Evaluator + +```python +@evaluator +def not_empty(ctx: EvalContext) -> bool: + return bool(ctx.output.strip()) +``` + +### Structured Evaluator + +```python +from dataclasses import dataclass + +@dataclass +class KeywordScores: + keyword_recall: float # metric → stats + all_present: bool # verdict → pass/fail + detail: str = "" # reason → shown on failure + +@evaluator +def keyword_check(ctx: EvalContext, keywords: list[str], min_recall: float = 0.5) -> KeywordScores: + found = [k for k in keywords if k.lower() in ctx.output.lower()] + recall = len(found) / len(keywords) + return KeywordScores( + keyword_recall=recall, + all_present=recall >= min_recall, + detail=f"found {len(found)}/{len(keywords)}", + ) +``` + +The threshold (`min_recall`) is a parameter of the evaluator, not a framework concept. The evaluator decides the verdict. + +### Async (LLM Judge) + +```python +@dataclass +class JudgeResult: + accuracy: float + accurate_enough: bool + reason: str = "" + +@evaluator +async def llm_judge(ctx: EvalContext, rubric: str = "", min_score: float = 0.7) -> JudgeResult: + result = await judge_agent.run(f"Evaluate: {ctx.output}\nCriteria: {rubric}") + score = parse_score(result) + return JudgeResult(accuracy=score, accurate_enough=score >= min_score, reason=result.explanation) +``` + +### Per-Case Thresholds + +Different thresholds per case = different evaluator bindings: + +```python +EvalCase(inputs="easy lookup", evaluators=[keyword_check(keywords=["paris"], min_recall=0.9)]), +EvalCase(inputs="hard causal", evaluators=[keyword_check(keywords=["paris"], min_recall=0.3)]), +``` + +### Using Evaluators + +```python +# No params → use directly +evaluators=[not_empty] + +# With params → call to bind +evaluators=[keyword_check(keywords=["python", "async"], min_recall=0.75)] + +# Per-case evaluators (added to suite-level) +EvalCase(inputs="...", evaluators=[llm_judge(rubric="Check factual accuracy")]) +``` + +### EvalContext + +| Field | Type | 
Description | +|-------|------|-------------| +| `name` | `str` | Case name | +| `inputs` | `I` | Case inputs | +| `output` | `O` | Task return value | +| `expected_output` | `O \| None` | From `EvalCase.expected` | +| `metadata` | `Any` | From `EvalCase.metadata` | +| `duration` | `float` | Task execution time (seconds) | + +### Built-in Evaluators + +| Evaluator | Params | Returns | +|-----------|--------|---------| +| `contains_keywords` | `keywords, min_recall=0.0` | `keyword_recall: float`, `all_keywords_present: bool` | +| `contains_expected` | `case_sensitive=False` | `bool` | +| `does_not_contain` | `forbidden` | `no_forbidden_words: bool` | +| `not_empty` | — | `bool` | +| `max_length` | `max_chars=500` | `conciseness: float`, `within_limit: bool` | +| `min_length` | `min_chars=1` | `bool` | +| `matches_regex` | `pattern` | `bool` | +| `json_valid` | `required_keys=[]` | `valid_json: bool`, `has_required_keys: bool` | +| `word_overlap` | — | `overlap: float` (tracking-only) | + +## Fixtures + +Evals use the same fixture system as tests. Expensive setup (database, pipeline, graph) runs once and is shared across all cases. + +```python +@fixture() +async def pipeline(): + driver = await build_pipeline() # 3 minutes, once + yield driver + await driver.close() + +session.bind(pipeline) + +@session.eval(evaluators=[my_scorer]) +async def pipeline_eval( + case: Annotated[EvalCase, From(cases)], + driver: Annotated[AsyncDriver, Use(pipeline)], +) -> QueryResult: + return await query(driver, case.inputs) +``` + +## ModelInfo + +`ModelInfo` is a **label for history tracking** — it does not configure or route to any model. It records which model produced the results so you can compare runs. + +```python +session = EvalSession(model=ModelInfo(name="qwen-2.5")) +``` + +## Evaluator Errors + +If an evaluator raises an exception (e.g. LLM judge timeout), the case is marked as **error** (not fail). The stack trace appears in the output. 
Scores from other evaluators that ran before the error are lost. + +> **Tip:** For non-deterministic evaluators (LLM judges), catch exceptions in the evaluator and return a score indicating failure rather than letting them propagate. + +## Multi-Model Sessions + +Track which model produced each eval suite's results: + +```python +pipeline_model = ModelInfo(name="qwen-2.5") +chat_model = ModelInfo(name="mistral-7b") + +session = EvalSession(model=pipeline_model) + +@session.eval(evaluators=[...], name="pipeline", model=pipeline_model) +async def pipeline_eval(case, driver) -> str: ... + +@session.eval(evaluators=[...], name="chatbot", model=chat_model) +async def chatbot_eval(case, deps) -> str: ... +``` + +`protest history --runs` shows the model per suite: + +``` +#1 2026-03-28T09:14 57/81 (70%) cb6f7bc + pipeline 29/39 (74%) qwen-2.5 + chatbot 10/21 (48%) mistral-7b +``` + +## CLI + +```bash +# Run evals +protest eval evals.session:session + +# Parallelism +protest eval evals.session:session -n 4 + +# Filter by tag +protest eval evals.session:session --tag chatbot + +# Filter by name +protest eval evals.session:session -k "lookup" + +# Re-run failures only +protest eval evals.session:session --last-failed + +# Verbosity: scores inline +protest eval evals.session:session -v + +# Show eval inputs/output/expected on passing cases +protest eval evals.session:session --show-output + +# Show captured log records +protest eval evals.session:session --show-logs +protest eval evals.session:session --show-logs=DEBUG +``` + +Flags are independent and combinable: `-v --show-output --show-logs`. + +> **Note:** Failed eval cases always show inputs/output/expected — no flag needed. 
+
+## Output
+
+### Default
+
+```
+ ✓ chatbot[lookup] (3.39s) facts_score=1.00 facts_ok=✓
+ ✗ chatbot[causal]: facts_ok=False, LLMJudge=False
+
+ Eval: chatbot (26 cases)
+┏━━━━━━━━━━━━━┳━━━━━━┳━━━━━━┳━━━━━━┳━━━━━━┓
+┃ Score       ┃ mean ┃ p50  ┃ p5   ┃ p95  ┃
+┡━━━━━━━━━━━━━╇━━━━━━╇━━━━━━╇━━━━━━╇━━━━━━┩
+│ facts_score │ 0.37 │ 0.00 │ 0.00 │ 1.00 │
+└─────────────┴──────┴──────┴──────┴──────┘
+ Passed: 14/26 (53.8%)
+ Results: .protest/results/chatbot_20260329_091422
+```
+
+### Per-Case Results
+
+Each eval case writes a markdown file to `.protest/results/<suite>_<timestamp>/`:
+
+```
+.protest/results/chatbot_20260329_091422/
+├── lookup.md
+├── causal.md
+└── negative.md
+```
+
+## History
+
+Eval results are persisted as JSONL in `.protest/history.jsonl`. Track trends across runs.
+
+```bash
+# Run list with per-suite breakdown
+protest history --evals --runs
+
+# Detailed view of latest run
+protest history --evals --show
+
+# Compare last two runs (fixed/regressed/new)
+protest history --evals --compare
+```
+
+### Integrity Hashes
+
+Each case in history carries two hashes:
+
+- **`case_hash`** — hash of inputs + expected output. Changes when the test data changes.
+- **`eval_hash`** — hash of evaluators + thresholds. Changes when the scoring criteria change.
+
+`protest history --compare` uses these hashes to detect modified cases vs regressions. If a case's `eval_hash` changed between runs, it's reported as "scoring modified" rather than a real regression.
+
+## Progress Output
+
+For long-running fixtures, use `console.print` to show progress without polluting test capture:
+
+```python
+from protest import console
+
+@fixture()
+async def pipeline():
+    for i, scene in enumerate(scenes):
+        console.print(f"[cyan]pipeline:[/] importing {scene.name} ({i+1}/{len(scenes)})")
+        await import_scene(scene)
+    return driver
+```
+
+Messages appear inline in the reporter output. Rich markup is supported (stripped for ASCII).
diff --git a/examples/yorkshire/app/chatbot.py b/examples/yorkshire/app/chatbot.py new file mode 100644 index 0000000..dedc1e4 --- /dev/null +++ b/examples/yorkshire/app/chatbot.py @@ -0,0 +1,93 @@ +"""Yorkshire Terrier Expert Chatbot — fake LLM for eval demos. + +Simulates a RAG chatbot with realistic imperfections: +- Sometimes misses keywords (simulates retrieval failures) +- Occasionally adds irrelevant info (simulates hallucination) +- Response quality varies (simulates LLM non-determinism) +""" + +from __future__ import annotations + +import random + +# Knowledge base — what a real RAG system would retrieve +YORKSHIRE_FACTS = { + "size": "Yorkshire Terriers typically weigh between 2-3 kg. They come in teacup, mini, and standard sizes.", + "grooming": "Yorkies with long coats need daily brushing. Seniors over 6 years need extra grooming care. Regular baths every 2-3 weeks.", + "temperament": "Yorkies are bold, confident, and affectionate. Despite their small size, they are courageous and sometimes stubborn.", + "health": "Common health issues include dental problems, patellar luxation, and tracheal collapse. Regular vet checkups recommended.", + "training": "Yorkies are intelligent but can be stubborn. Positive reinforcement works best. Start training early for best results.", + "diet": "Small breed formula recommended. Feed 2-3 small meals per day. Avoid chocolate, grapes, and onions.", + "exercise": "30 minutes of daily exercise is sufficient. Short walks and indoor play. Avoid extreme temperatures.", + "jobs": "Historically bred as ratters. Modern Yorkies excel as therapy dogs, influencers, and loyal companions.", + "puppies": "Yorkshire puppies need extra care until 12 months. Socialization is critical in the first 6 months.", + "seniors": "Senior Yorkies (8+ years) may slow down. Adjust exercise and diet. 
More frequent vet visits recommended.", +} + + +def yorkshire_chatbot(question: str) -> str: + """Fake chatbot that answers questions about Yorkshire Terriers. + + Simulates a RAG pipeline: keyword matching → fact retrieval → response generation. + No LLM calls — pure string matching for deterministic eval testing. + """ + question_lower = question.lower() + + # Find relevant facts by keyword matching + relevant_facts: list[str] = [] + for topic, fact in YORKSHIRE_FACTS.items(): + if topic in question_lower or any( + word in question_lower for word in topic.split() + ): + relevant_facts.append(fact) + + # Check for specific question patterns + if "weight" in question_lower or "how heavy" in question_lower: + relevant_facts.append(YORKSHIRE_FACTS["size"]) + if "brush" in question_lower or "coat" in question_lower: + relevant_facts.append(YORKSHIRE_FACTS["grooming"]) + if "eat" in question_lower or "food" in question_lower or "feed" in question_lower: + relevant_facts.append(YORKSHIRE_FACTS["diet"]) + if "walk" in question_lower or "active" in question_lower: + relevant_facts.append(YORKSHIRE_FACTS["exercise"]) + if "old" in question_lower or "aging" in question_lower: + relevant_facts.append(YORKSHIRE_FACTS["seniors"]) + if ( + "puppy" in question_lower + or "baby" in question_lower + or "young" in question_lower + ): + relevant_facts.append(YORKSHIRE_FACTS["puppies"]) + + # Deduplicate while preserving order + seen: set[str] = set() + unique_facts = [] + for fact in relevant_facts: + if fact not in seen: + seen.add(fact) + unique_facts.append(fact) + + if not unique_facts: + return "I'm not sure about that. I specialize in Yorkshire Terrier care and health." + + response = " ".join(unique_facts) + + # Simulate LLM imperfections + # ~20% chance: drop a sentence (simulates retrieval miss) + if random.random() < 0.2 and ". " in response: # noqa: S311, PLR2004 + sentences = response.split(". 
") + drop_idx = random.randint(0, len(sentences) - 1) # noqa: S311 + sentences.pop(drop_idx) + response = ". ".join(sentences) + + # ~10% chance: add irrelevant filler (simulates rambling) + if random.random() < 0.1: # noqa: S311, PLR2004 + response += " By the way, Yorkshire Terriers were originally bred in Yorkshire, England during the 19th century." + + # ~5% chance: return a vague non-answer (simulates confusion) + if random.random() < 0.05: # noqa: S311, PLR2004 + response = ( + "That's a great question about Yorkies! There are many factors to consider." + ) + + return response diff --git a/examples/yorkshire/evals/__init__.py b/examples/yorkshire/evals/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/examples/yorkshire/evals/dataset.py b/examples/yorkshire/evals/dataset.py new file mode 100644 index 0000000..7153ab6 --- /dev/null +++ b/examples/yorkshire/evals/dataset.py @@ -0,0 +1,122 @@ +"""Dataset for the Yorkshire chatbot evals.""" + +from __future__ import annotations + +from protest import ForEach +from protest.evals.evaluators import ( + contains_keywords, + does_not_contain, + max_length, + not_empty, +) + +yorkshire_cases = ForEach( + [ + # --- Factual recall --- + { + "name": "weight_question", + "inputs": "How much does a Yorkshire Terrier weigh?", + "expected": "2-3 kg", + "metadata": {"tags": ["factual", "size"]}, + "evaluators": [ + contains_keywords(keywords=["2-3 kg", "teacup", "mini", "standard"]) + ], + }, + { + "name": "grooming_basics", + "inputs": "How often should I brush my Yorkie?", + "expected": "daily brushing for long coats", + "metadata": {"tags": ["factual", "grooming"]}, + "evaluators": [contains_keywords(keywords=["daily", "brushing", "long"])], + }, + { + "name": "diet_advice", + "inputs": "What should I feed my Yorkshire Terrier?", + "expected": "small breed formula, 2-3 meals", + "metadata": {"tags": ["factual", "diet"]}, + "evaluators": [ + contains_keywords(keywords=["small breed", "meals", "avoid"]) + 
], + }, + { + "name": "exercise_needs", + "inputs": "How much exercise does a Yorkie need?", + "expected": "30 minutes daily", + "metadata": {"tags": ["factual", "exercise"]}, + "evaluators": [contains_keywords(keywords=["30 minutes", "walk"])], + }, + # --- Temperament --- + { + "name": "personality", + "inputs": "What is the temperament of a Yorkshire Terrier?", + "expected": "bold, confident, affectionate", + "metadata": {"tags": ["factual", "temperament"]}, + "evaluators": [ + contains_keywords(keywords=["bold", "confident", "affectionate"]) + ], + }, + # --- Age-specific --- + { + "name": "puppy_care", + "inputs": "How do I care for a Yorkshire puppy?", + "expected": "extra care, socialization", + "metadata": {"tags": ["factual", "puppies"]}, + "evaluators": [contains_keywords(keywords=["12 months", "socialization"])], + }, + { + "name": "senior_care", + "inputs": "My Yorkie is getting old, what should I change?", + "expected": "adjust exercise, more vet visits", + "metadata": {"tags": ["factual", "seniors"]}, + "evaluators": [contains_keywords(keywords=["senior", "exercise", "vet"])], + }, + # --- Hallucination checks --- + { + "name": "no_cat_advice", + "inputs": "Tell me about Yorkshire Terrier health", + "expected": "dental problems, patellar luxation", + "metadata": {"tags": ["safety"]}, + "evaluators": [ + does_not_contain(forbidden=["cat", "feline", "persian"]), + contains_keywords(keywords=["dental", "health"]), + ], + }, + { + "name": "no_made_up_breeds", + "inputs": "What jobs can a Yorkie do?", + "expected": "therapy dogs, companions", + "metadata": {"tags": ["safety"]}, + "evaluators": [ + does_not_contain(forbidden=["labrador", "golden retriever", "poodle"]), + contains_keywords(keywords=["therapy", "companion"]), + ], + }, + # --- Edge cases --- + { + "name": "unknown_topic", + "inputs": "What is the GDP of France?", + "expected": "I'm not sure", + "metadata": {"tags": ["edge_case"]}, + "evaluators": [contains_keywords(keywords=["not sure", 
"specialize"])], + }, + { + "name": "empty_question", + "inputs": "", + "expected": "I'm not sure", + "metadata": {"tags": ["edge_case"]}, + "evaluators": [contains_keywords(keywords=["not sure"])], + }, + # --- Known weak spot (chatbot doesn't know about training treats) --- + { + "name": "training_treats", + "inputs": "What treats are best for training a Yorkie?", + "expected": "small soft treats, positive reinforcement", + "metadata": {"tags": ["factual", "training"]}, + "evaluators": [ + contains_keywords(keywords=["treats", "small", "soft", "reward"]) + ], + }, + ] +) + +suite_evaluators = [not_empty, max_length(max_chars=500)] diff --git a/examples/yorkshire/evals/evaluators.py b/examples/yorkshire/evals/evaluators.py new file mode 100644 index 0000000..b07153d --- /dev/null +++ b/examples/yorkshire/evals/evaluators.py @@ -0,0 +1,5 @@ +"""Yorkshire-specific evaluators. + +Generic evaluators come from protest.evals.evaluators. +Only project-specific ones live here. +""" diff --git a/examples/yorkshire/evals/session.py b/examples/yorkshire/evals/session.py new file mode 100644 index 0000000..7779f66 --- /dev/null +++ b/examples/yorkshire/evals/session.py @@ -0,0 +1,29 @@ +"""Yorkshire Chatbot Evals — evaluate the fake Yorkshire expert chatbot. 
+ +Run with: + protest eval examples.yorkshire.evals.session:session + protest eval examples.yorkshire.evals.session:session -n 4 + protest eval examples.yorkshire.evals.session:session --tag safety + protest eval examples.yorkshire.evals.session:session --last-failed + protest history --evals --show +""" + +from typing import Annotated + +from examples.yorkshire.app.chatbot import yorkshire_chatbot +from examples.yorkshire.evals.dataset import ( + suite_evaluators, + yorkshire_cases, +) +from protest import From +from protest.evals import EvalSession, ModelInfo + +session = EvalSession( + model=ModelInfo(name="yorkshire-chatbot-v1", provider="local"), + metadata={"version": "1.0", "type": "keyword-matching"}, +) + + +@session.eval(evaluators=suite_evaluators) +def yorkshire_eval(case: Annotated[dict, From(yorkshire_cases)]) -> str: + return yorkshire_chatbot(case["inputs"]) diff --git a/examples/yorkshire/session.py b/examples/yorkshire/session.py new file mode 100644 index 0000000..7b8c3c3 --- /dev/null +++ b/examples/yorkshire/session.py @@ -0,0 +1,52 @@ +"""Yorkshire Terrier Unified Session — tests + evals in one session. 
+ +Run all (tests + evals): + protest run examples.yorkshire.session:session + +Run only tests: + protest run examples.yorkshire.session:session + (protest run filters to kind=test by default) + +Run only evals: + protest eval examples.yorkshire.session:session +""" + +from examples.yorkshire.app.chatbot import yorkshire_chatbot +from examples.yorkshire.evals.dataset import dataset +from examples.yorkshire.tests.fixtures import ( + configure_kennel_logging, + kennel, + yorkshire, +) +from examples.yorkshire.tests.plugins import BarkPlugin +from examples.yorkshire.tests.suites.adults import adults_suite +from examples.yorkshire.tests.suites.custom_factory import custom_factory_suite +from examples.yorkshire.tests.suites.legacy.suite import legacy_suite +from examples.yorkshire.tests.suites.puppies.suite import puppies_suite +from examples.yorkshire.tests.suites.rate_limited import rate_limited_suite +from examples.yorkshire.tests.suites.seniors.suite import seniors_suite +from examples.yorkshire.tests.suites.showcase.suite import showcase_suite +from protest import ProTestSession +from protest.evals import ModelInfo + +session = ProTestSession(concurrency=4, history=True) +session.use(BarkPlugin) +session.bind(configure_kennel_logging, autouse=True) +session.bind(kennel) +session.bind(yorkshire) + +# Tests +session.add_suite(puppies_suite) +session.add_suite(adults_suite) +session.add_suite(seniors_suite) +session.add_suite(legacy_suite) +session.add_suite(showcase_suite) +session.add_suite(rate_limited_suite) +session.add_suite(custom_factory_suite) + +# Evals +session.configure_evals(model=ModelInfo(name="yorkshire-chatbot-v1", provider="local")) +session.register_dataset( + dataset, + task=yorkshire_chatbot, +) diff --git a/mkdocs.yml b/mkdocs.yml index 93864db..a643afe 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -65,6 +65,8 @@ nav: - Tags: core-concepts/tags.md - Dependency Injection: core-concepts/dependency-injection.md - Reporters: core-concepts/reporters.md 
+ - Console Output: core-concepts/console.md + - Evals: evals.md - Guides: - Best Practices: best-practices.md - Project Organization: guides/project-organization.md diff --git a/protest/history/__init__.py b/protest/history/__init__.py new file mode 100644 index 0000000..5183cf7 --- /dev/null +++ b/protest/history/__init__.py @@ -0,0 +1,17 @@ +"""History module — run tracking for tests and evals.""" + +from protest.history.storage import ( + HISTORY_FILE, + append_entry, + clean_dirty, + load_history, + load_previous_run, +) + +__all__ = [ + "HISTORY_FILE", + "append_entry", + "clean_dirty", + "load_history", + "load_previous_run", +] diff --git a/protest/history/collector.py b/protest/history/collector.py new file mode 100644 index 0000000..e81eefd --- /dev/null +++ b/protest/history/collector.py @@ -0,0 +1,81 @@ +"""Metadata collection: git info, environment, CI detection.""" + +from __future__ import annotations + +import os +import platform +import subprocess +import sys +from typing import Any + + +def collect_git_info() -> dict[str, Any] | None: + """Collect git context. 
Returns None if not in a git repo.""" + try: + commit = _git("rev-parse", "HEAD") + return { + "commit": commit, + "commit_short": commit[:7] if commit else None, + "branch": _git("rev-parse", "--abbrev-ref", "HEAD"), + "dirty": bool(_git("status", "--porcelain")), + "author": _git("log", "-1", "--format=%an"), + "commit_message": _git("log", "-1", "--format=%s"), + } + except (FileNotFoundError, subprocess.CalledProcessError): + return None + + +def collect_env_info() -> dict[str, Any]: + """Collect environment metadata.""" + ci_provider = detect_ci_provider() + return { + "python_version": platform.python_version(), + "protest_version": _get_pkg_version("protest"), + "pydantic_evals_version": _get_pkg_version("pydantic-evals"), + "hostname": platform.node(), + "os": sys.platform, + "ci": ci_provider is not None, + "ci_provider": ci_provider, + } + + +_CI_PROVIDERS: dict[str, str] = { + "GITHUB_ACTIONS": "github-actions", + "GITLAB_CI": "gitlab-ci", + "CIRCLECI": "circleci", + "BUILDKITE": "buildkite", + "TRAVIS": "travis-ci", +} + + +def detect_ci_provider() -> str | None: + """Detect CI provider from standard environment variables.""" + env = os.environ + for var, name in _CI_PROVIDERS.items(): + if env.get(var) == "true": + return name + if env.get("JENKINS_URL"): + return "jenkins" + if env.get("CI") == "true": + return "unknown" + return None + + +def _git(*args: str) -> str: + result = subprocess.run( + ["git", *args], # noqa: S607 + capture_output=True, + text=True, + timeout=5, + check=True, + ) + return result.stdout.strip() + + +def _get_pkg_version(name: str) -> str | None: + try: + from importlib.metadata import version + + return version(name) + except Exception: + return None diff --git a/protest/history/plugin.py b/protest/history/plugin.py new file mode 100644 index 0000000..4fe80f6 --- /dev/null +++ b/protest/history/plugin.py @@ -0,0 +1,98 @@ +"""HistoryPlugin — persists test run results as JSONL.""" + +from __future__ import annotations + 
+import uuid +from datetime import datetime, timezone +from typing import TYPE_CHECKING, Any + +from protest.history.collector import collect_env_info, collect_git_info +from protest.history.storage import DEFAULT_HISTORY_DIR, HISTORY_FILE, append_entry +from protest.plugin import PluginBase + +if TYPE_CHECKING: + from pathlib import Path + + from protest.entities.events import SessionResult, TestResult + from protest.plugin import PluginContext + + +class HistoryPlugin(PluginBase): + """Persists test results to JSONL for run-over-run tracking.""" + + name = "history" + description = "Test history tracking" + + def __init__(self, history_dir: Path | None = None) -> None: + self._history_dir = history_dir or DEFAULT_HISTORY_DIR + self._history_file = self._history_dir / HISTORY_FILE + self._suites: dict[str, dict[str, dict[str, Any]]] = {} + self._suite_kinds: dict[str, str] = {} + self._default_suite_name: str = "tests" + self._history_enabled: bool = False + self._metadata: dict[str, Any] = {} + + @classmethod + def activate(cls, ctx: PluginContext) -> HistoryPlugin | None: + return None # Wired explicitly by session + + def setup(self, session: Any) -> None: + self._history_enabled = getattr(session, "history", False) + self._metadata = dict(getattr(session, "metadata", None) or {}) + for suite in getattr(session, "suites", []): + self._suite_kinds[suite.name] = getattr(suite, "kind", "test") + if not self._default_suite_name or self._default_suite_name == "tests": + self._default_suite_name = suite.name + + def on_test_pass(self, result: TestResult) -> None: + if result.is_eval: + return + self._record(result, passed=True) + + def on_test_fail(self, result: TestResult) -> None: + if result.is_eval: + return + self._record(result, passed=False) + + def on_session_end(self, _result: SessionResult) -> None: + if not self._history_enabled or not self._suites: + return + + suites_data: dict[str, Any] = {} + for suite_name, cases in self._suites.items(): + total = 
len(cases) + passed = sum(1 for c in cases.values() if c["passed"]) + suites_data[suite_name] = { + "kind": self._suite_kinds.get(suite_name, "test"), + "total_cases": total, + "passed": passed, + "failed": total - passed, + "pass_rate": round(passed / total, 4) if total else 0, + "duration": round(sum(c["duration"] for c in cases.values()), 2), + "cases": cases, + } + + entry: dict[str, Any] = { + "run_id": str(uuid.uuid4()), + "timestamp": datetime.now(tz=timezone.utc).isoformat(), + "git": collect_git_info(), + "environment": collect_env_info(), + "metadata": self._metadata, + "evals": None, + "suites": suites_data, + } + append_entry(self._history_file, entry) + + def _record(self, result: TestResult, *, passed: bool) -> None: + suite_name = self._get_suite_name(result) + if suite_name not in self._suites: + self._suites[suite_name] = {} + self._suites[suite_name][result.name] = { + "passed": passed, + "duration": round(result.duration, 3), + } + + def _get_suite_name(self, result: TestResult) -> str: + if result.suite_path: + return result.suite_path.root_name + return self._default_suite_name diff --git a/protest/history/storage.py b/protest/history/storage.py new file mode 100644 index 0000000..78d35b9 --- /dev/null +++ b/protest/history/storage.py @@ -0,0 +1,135 @@ +"""JSONL history storage: load, append, filter, clean.""" + +from __future__ import annotations + +import json +import subprocess +from pathlib import Path +from typing import Any + +DEFAULT_HISTORY_DIR = Path(".protest") +HISTORY_FILE = "history.jsonl" + + +def load_history( + history_dir: Path | None = None, + n: int | None = None, + model: str | None = None, + suite: str | None = None, + evals_only: bool = False, + tests_only: bool = False, +) -> list[dict[str, Any]]: + """Load history entries with optional filtering.""" + path = (history_dir or DEFAULT_HISTORY_DIR) / HISTORY_FILE + if not path.exists(): + return [] + + entries: list[dict[str, Any]] = [] + for line in 
path.read_text().strip().splitlines(): + try: + entry = json.loads(line) + except json.JSONDecodeError: + continue + if evals_only and not _has_suite_kind(entry, "eval"): + continue + if tests_only and not _has_suite_kind(entry, "test"): + continue + if model and (entry.get("evals") or {}).get("model") != model: + continue + if suite and suite not in entry.get("suites", {}): + continue + entries.append(entry) + + entries.sort(key=lambda e: e.get("timestamp", "")) + if n is not None: + entries = entries[-n:] + return entries + + +def _has_suite_kind(entry: dict[str, Any], kind: str) -> bool: + """Check if entry has at least one suite with the given kind.""" + suites = entry.get("suites", {}) + for suite_data in suites.values(): + if isinstance(suite_data, dict) and suite_data.get("kind") == kind: + return True + # Legacy fallback: entries without kind field + if not any(isinstance(s, dict) and "kind" in s for s in suites.values()): + if kind == "eval": + return entry.get("evals") is not None + if kind == "test": + return entry.get("evals") is None + return False + + +def append_entry(path: Path, entry: dict[str, Any]) -> None: + """Append a single JSON entry to a JSONL file. + + Note: no file locking — concurrent writes from separate processes + could corrupt the file. In practice, protest runs are single-process + (async workers share the same process). If concurrent CI jobs write + to the same history file, consider using separate history_dir per job. 
+ """ + path.parent.mkdir(parents=True, exist_ok=True) + with open(path, "a") as f: + f.write(json.dumps(entry, default=str) + "\n") + + +def load_previous_run( + history_dir: Path | None = None, + evals_only: bool = False, +) -> dict[str, Any] | None: + """Load the most recent history entry.""" + path = (history_dir or DEFAULT_HISTORY_DIR) / HISTORY_FILE + if not path.exists(): + return None + lines = path.read_text().strip().splitlines() + for line in reversed(lines): + try: + entry = json.loads(line) + except json.JSONDecodeError: + continue + if evals_only and entry.get("evals") is None: + continue + return entry + return None + + +def clean_dirty(history_dir: Path | None = None) -> int: + """Remove entries where git.dirty=True AND git.commit matches current HEAD. + + Returns the number of entries removed. + """ + path = (history_dir or DEFAULT_HISTORY_DIR) / HISTORY_FILE + if not path.exists(): + return 0 + + try: + current_commit = subprocess.run( + ["git", "rev-parse", "HEAD"], # noqa: S607 + capture_output=True, + text=True, + timeout=5, + check=True, + ).stdout.strip() + except (FileNotFoundError, subprocess.CalledProcessError): + return 0 + + lines = path.read_text().strip().splitlines() + kept: list[str] = [] + removed = 0 + + for line in lines: + try: + entry = json.loads(line) + except json.JSONDecodeError: + kept.append(line) + continue + git = entry.get("git") or {} + if git.get("dirty") and git.get("commit") == current_commit: + removed += 1 + else: + kept.append(line) + + if removed: + path.write_text("\n".join(kept) + "\n" if kept else "") + return removed diff --git a/tests/core/test_collector.py b/tests/core/test_collector.py index 6b02ad7..9ba8719 100644 --- a/tests/core/test_collector.py +++ b/tests/core/test_collector.py @@ -88,7 +88,7 @@ def test_collect_suite_tests(self) -> None: """Collects tests from suites.""" session = ProTestSession() suite = ProTestSuite("my_suite") - session.include_suite(suite) + session.add_suite(suite) 
@suite.test() def suite_test() -> None: @@ -107,7 +107,7 @@ def test_collect_mixed_tests(self) -> None: """Collects both standalone and suite tests.""" session = ProTestSession() suite = ProTestSuite("my_suite") - session.include_suite(suite) + session.add_suite(suite) @session.test() def standalone_test() -> None: @@ -129,7 +129,7 @@ def test_collect_generates_correct_node_ids(self) -> None: """Collected items have correct node_ids.""" session = ProTestSession() suite = ProTestSuite("MySuite") - session.include_suite(suite) + session.add_suite(suite) @session.test() def standalone() -> None: diff --git a/tests/core/test_parametrize.py b/tests/core/test_parametrize.py index ec567db..df8a9ac 100644 --- a/tests/core/test_parametrize.py +++ b/tests/core/test_parametrize.py @@ -190,7 +190,7 @@ def test_triple( def test_structured_data_for_reporters(self) -> None: session = ProTestSession() suite = ProTestSuite("API") - session.include_suite(suite) + session.add_suite(suite) users = ForEach(["alice"], ids=lambda u: u) diff --git a/tests/core/test_skip.py b/tests/core/test_skip.py index 437e47d..71cddb1 100644 --- a/tests/core/test_skip.py +++ b/tests/core/test_skip.py @@ -54,7 +54,7 @@ def test_normal() -> None: def test_suite_skip_decorator(self) -> None: session = ProTestSession() suite = ProTestSuite("test") - session.include_suite(suite) + session.add_suite(suite) @suite.test(skip="Suite test skipped") def test_skipped() -> None: diff --git a/tests/core/test_skipif.py b/tests/core/test_skipif.py index 65fe632..4e24388 100644 --- a/tests/core/test_skipif.py +++ b/tests/core/test_skipif.py @@ -74,7 +74,7 @@ def test_skipped() -> None: def test_suite_skip_with_callable(self) -> None: session = ProTestSession() suite = ProTestSuite("test") - session.include_suite(suite) + session.add_suite(suite) @suite.test(skip=lambda: True, skip_reason="Suite conditional skip") def test_skipped() -> None: diff --git a/tests/core/test_xfail.py b/tests/core/test_xfail.py index 
8451e23..4cf1d0a 100644 --- a/tests/core/test_xfail.py +++ b/tests/core/test_xfail.py @@ -57,7 +57,7 @@ def test_normal() -> None: def test_suite_xfail_decorator(self) -> None: session = ProTestSession() suite = ProTestSuite("test") - session.include_suite(suite) + session.add_suite(suite) @suite.test(xfail="Suite test xfailed") def test_xfailed() -> None: diff --git a/tests/evals/test_e2e.py b/tests/evals/test_e2e.py new file mode 100644 index 0000000..5fbb4e8 --- /dev/null +++ b/tests/evals/test_e2e.py @@ -0,0 +1,1064 @@ +"""End-to-end tests for ProTest evals integration. + +These tests define the PUBLIC API contract. They test what the user sees: +- Session setup (EvalSession, @session.eval with ForEach/From) +- CLI behavior (protest run vs protest eval) +- Output format (scores table, trends, failure messages) +- History (JSONL format, stats, significance, clean-dirty) +- Built-in evaluators + +Implementation can change freely as long as these tests pass. +""" + +from __future__ import annotations + +import json +from dataclasses import dataclass +from pathlib import Path +from typing import Annotated, Any + +from protest import ForEach, From, ProTestSession +from protest.core.runner import TestRunner +from protest.evals import EvalContext, EvalSession, Metric, ModelInfo, Verdict, evaluator +from protest.evals.evaluators import ( + contains_expected, + contains_keywords, + does_not_contain, + json_valid, + matches_regex, + max_length, + min_length, + not_empty, + word_overlap, +) + +# --------------------------------------------------------------------------- +# Fixtures: deterministic evaluators + task +# --------------------------------------------------------------------------- + + +@dataclass(frozen=True, slots=True) +class FakeAccuracyResult: + """Structured result for fake accuracy evaluator.""" + + accuracy: Annotated[float, Metric] + matches_expected: Annotated[bool, Verdict] + + +@evaluator +def fake_accuracy(ctx: EvalContext) -> FakeAccuracyResult: + 
if ctx.expected_output and ctx.expected_output.lower() in ctx.output.lower(): + return FakeAccuracyResult(accuracy=1.0, matches_expected=True) + return FakeAccuracyResult(accuracy=0.0, matches_expected=False) + + +@evaluator +async def async_fake_accuracy(ctx: EvalContext) -> FakeAccuracyResult: + """Async evaluator — simulates LLMJudge which calls an async LLM API.""" + # Simulate async I/O (e.g. LLM call) without actually blocking + if ctx.expected_output and ctx.expected_output.lower() in ctx.output.lower(): + return FakeAccuracyResult(accuracy=1.0, matches_expected=True) + return FakeAccuracyResult(accuracy=0.0, matches_expected=False) + + +def echo_task(text: str) -> str: + return f"Echo: {text}" + + +async def async_echo_task(text: str) -> str: + return f"Async: {text}" + + +basic_cases = ForEach( + [ + {"inputs": "hello world", "expected": "hello", "name": "case_pass"}, + {"inputs": "xyz", "expected": "notfound", "name": "case_fail"}, + ], + ids=lambda c: c["name"], +) + + +# --------------------------------------------------------------------------- +# Session setup +# --------------------------------------------------------------------------- + + +class TestEvalSession: + """EvalSession setup: constructor with model=, @session.eval.""" + + def test_add_eval_creates_eval_kind(self) -> None: + session = EvalSession() + + @session.eval(evaluators=[fake_accuracy]) + def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: + return echo_task(case["inputs"]) + + # The session should have a suite with kind=eval + assert len(session._suites) > 0 + assert any(s.kind == "eval" for s in session._suites) + + def test_model_set_via_constructor(self) -> None: + session = EvalSession(model=ModelInfo(name="test-model")) + assert session._eval_model is not None + assert session._eval_model.name == "test-model" + + def test_metadata_on_constructor(self) -> None: + session = EvalSession(metadata={"env": "test"}) + assert session.metadata["env"] == "test" + + def 
test_eval_with_bool_verdict(self) -> None: + """Evaluator with bool field: case_fail has matches_expected=False -> fail.""" + session = EvalSession() + + @session.eval(evaluators=[fake_accuracy]) + def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: + return echo_task(case["inputs"]) + + runner = TestRunner(session) + result = runner.run() + # case_pass returns matches_expected=True -> pass + # case_fail returns matches_expected=False -> fail + assert result.success is False + + def test_async_task_works(self) -> None: + session = EvalSession() + + @session.eval(evaluators=[fake_accuracy]) + async def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: + return await async_echo_task(case["inputs"]) + + runner = TestRunner(session) + runner.run() + + def test_async_evaluator_does_not_crash(self) -> None: + """Regression: async evaluator called via evaluate_sync raised 'event loop already running'.""" + single_case = ForEach( + [ + {"inputs": "hello world", "expected": "hello", "name": "c1"}, + ], + ids=lambda c: c["name"], + ) + + session = EvalSession() + + @session.eval(evaluators=[async_fake_accuracy]) + def eval_echo(case: Annotated[dict, From(single_case)]) -> str: + return echo_task(case["inputs"]) + + runner = TestRunner(session) + result = runner.run() + assert result.success is True + + +# --------------------------------------------------------------------------- +# Kind filtering (protest run vs protest eval) +# --------------------------------------------------------------------------- + + +class TestKindFiltering: + """Suites have kind, filtering works.""" + + def test_test_suite_has_kind_test(self) -> None: + from protest.core.suite import ProTestSuite + + suite = ProTestSuite("my_tests") + assert suite.kind == "test" + + def test_eval_suite_has_kind_eval(self) -> None: + session = EvalSession() + + @session.eval(evaluators=[fake_accuracy]) + def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: + return 
echo_task(case["inputs"]) + + assert any(s.kind == "eval" for s in session._suites) + + def test_kind_filter_keeps_only_matching(self) -> None: + from protest.core.suite import ProTestSuite + from protest.filters.kind import KindFilterPlugin + + test_suite = ProTestSuite("tests") + eval_suite = ProTestSuite("evals", kind="eval") + + session = ProTestSession() + + @test_suite.test() + def test_one() -> None: + pass + + @eval_suite.test(is_eval=True) + def eval_one() -> None: + pass + + session.add_suite(test_suite) + session.add_suite(eval_suite) + + from protest.core.collector import Collector + + items = Collector().collect(session) + assert len(items) == 2 + + # Filter to eval only + plugin = KindFilterPlugin(kind="eval") + filtered = plugin.on_collection_finish(items) + assert len(filtered) == 1 + assert filtered[0].suite.kind == "eval" + + def test_unified_session_runs_tests_only(self) -> None: + """protest run behavior: only kind=test suites.""" + from protest.core.suite import ProTestSuite + + session = ProTestSession() + + test_suite = ProTestSuite("unit") + results: list[str] = [] + + @test_suite.test() + def test_a() -> None: + results.append("test") + + session.add_suite(test_suite) + + @session.eval(evaluators=[fake_accuracy]) + def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: + return echo_task(case["inputs"]) + + from protest.api import run_session + from protest.plugin import PluginContext + + ctx = PluginContext(args={"kind_filter": "test"}) + run_session(session, ctx=ctx) + + assert "test" in results + + def test_unified_session_runs_evals_only(self) -> None: + """protest eval behavior: only kind=eval suites.""" + from protest.core.suite import ProTestSuite + + session = ProTestSession() + + test_suite = ProTestSuite("unit") + test_ran = [] + + @test_suite.test() + def test_a() -> None: + test_ran.append(True) + + session.add_suite(test_suite) + + @session.eval(evaluators=[fake_accuracy]) + def eval_echo(case: Annotated[dict, 
From(basic_cases)]) -> str: + return echo_task(case["inputs"]) + + from protest.api import run_session + from protest.plugin import PluginContext + + ctx = PluginContext(args={"kind_filter": "eval"}) + run_session(session, ctx=ctx) + + assert len(test_ran) == 0 # test suite was filtered out + + +# --------------------------------------------------------------------------- +# Output format +# --------------------------------------------------------------------------- + + +class TestEvalOutput: + """What the user sees in the terminal. + + These tests verify output by reading the EvalPlugin report directly, + since ProTest captures stdout during test runs. + """ + + def test_report_contains_score_stats(self) -> None: + from protest.evals.types import EvalSuiteReport + from protest.plugin import PluginBase + + reports: list[EvalSuiteReport] = [] + + class ReportCapture(PluginBase): + name = "report-capture" + description = "Captures eval reports" + + def on_eval_suite_end(self, report: Any) -> None: + reports.append(report) + + session = EvalSession() + session.register_plugin(ReportCapture()) + + @session.eval(evaluators=[fake_accuracy]) + def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: + return echo_task(case["inputs"]) + + runner = TestRunner(session) + runner.run() + + assert len(reports) == 1 + stats = reports[0].all_score_stats() + assert len(stats) > 0 + assert any(s.name == "accuracy" for s in stats) + + def test_report_has_pass_count(self) -> None: + from protest.evals.types import EvalSuiteReport + from protest.plugin import PluginBase + + reports: list[EvalSuiteReport] = [] + + class ReportCapture(PluginBase): + name = "report-capture" + description = "Captures eval reports" + + def on_eval_suite_end(self, report: Any) -> None: + reports.append(report) + + session = EvalSession() + session.register_plugin(ReportCapture()) + + @session.eval(evaluators=[fake_accuracy]) + def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: + return 
echo_task(case["inputs"]) + + runner = TestRunner(session) + runner.run() + + assert len(reports) == 1 + assert reports[0].total_count == 2 + + def test_failed_eval_has_error_with_score_details(self) -> None: + """When an eval case fails, the error message includes score details.""" + from protest.plugin import PluginBase + + errors: list[Any] = [] + + class ErrorCollector(PluginBase): + name = "error-collector" + + def on_test_fail(self, result: Any) -> None: + if result.error: + errors.append(str(result.error)) + + session = EvalSession() + session.register_plugin(ErrorCollector()) + + @session.eval(evaluators=[fake_accuracy]) + def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: + return echo_task(case["inputs"]) + + from protest.api import run_session + + run_session(session) + + # case_fail has matches_expected=False + assert any("matches_expected=" in e for e in errors) + + +# --------------------------------------------------------------------------- +# EvalPayload flow +# --------------------------------------------------------------------------- + + +class TestEvalPayloadFlow: + """EvalPayload flows through the framework correctly.""" + + def test_test_result_has_eval_payload(self) -> None: + from protest.plugin import PluginBase + + collected: list[Any] = [] + + class Collector(PluginBase): + name = "collector" + + def on_test_pass(self, result: Any) -> None: + collected.append(result) + + def on_test_fail(self, result: Any) -> None: + collected.append(result) + + session = EvalSession() + session.register_plugin(Collector()) + + @session.eval(evaluators=[fake_accuracy]) + def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: + return echo_task(case["inputs"]) + + runner = TestRunner(session) + runner.run() + + assert len(collected) == 2 + for result in collected: + assert result.is_eval is True + assert result.eval_payload is not None + assert result.eval_payload.case_name in ("case_pass", "case_fail") + assert "accuracy" in 
result.eval_payload.scores + assert "matches_expected" in result.eval_payload.scores + + def test_lifecycle_events_have_case_id_in_node_id(self) -> None: + """setup_done/teardown_start events carry node_id with [case_id].""" + from protest.plugin import PluginBase + + setup_ids: list[str] = [] + teardown_ids: list[str] = [] + + class LifecycleCollector(PluginBase): + name = "lifecycle-collector" + + def on_test_setup_done(self, info: Any) -> None: + setup_ids.append(info.node_id) + + def on_test_teardown_start(self, info: Any) -> None: + teardown_ids.append(info.node_id) + + session = EvalSession() + session.register_plugin(LifecycleCollector()) + + @session.eval(evaluators=[fake_accuracy]) + def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: + return echo_task(case["inputs"]) + + runner = TestRunner(session) + runner.run() + + assert len(setup_ids) == 2 + for node_id in setup_ids: + assert "[" in node_id, f"node_id missing case id: {node_id}" + for node_id in teardown_ids: + assert "[" in node_id, f"node_id missing case id: {node_id}" + + def test_evaluator_exception_is_error_not_fail(self) -> None: + """An evaluator that raises is treated as error (infra), not test fail.""" + from protest.plugin import PluginBase + + results: list[Any] = [] + + class Collector(PluginBase): + name = "collector" + + def on_test_fail(self, result: Any) -> None: + results.append(result) + + @evaluator + def crashing_evaluator(ctx: EvalContext) -> bool: + raise RuntimeError("LLM judge timeout") + + single_case = ForEach( + [ + {"inputs": "hello", "expected": "hello", "name": "c1"}, + ], + ids=lambda c: c["name"], + ) + + session = EvalSession() + session.register_plugin(Collector()) + + @session.eval(evaluators=[crashing_evaluator]) + def eval_echo(case: Annotated[dict, From(single_case)]) -> str: + return echo_task(case["inputs"]) + + runner = TestRunner(session) + runner.run() + + assert len(results) == 1 + assert results[0].is_fixture_error is True + assert "LLM judge 
timeout" in str(results[0].error) + + def test_non_eval_test_has_no_payload(self) -> None: + from protest.plugin import PluginBase + + collected: list[Any] = [] + + class Collector(PluginBase): + name = "collector" + + def on_test_pass(self, result: Any) -> None: + collected.append(result) + + session = ProTestSession() + session.register_plugin(Collector()) + + @session.test() + def regular_test() -> None: + assert True + + runner = TestRunner(session) + runner.run() + + assert len(collected) == 1 + assert collected[0].is_eval is False + assert collected[0].eval_payload is None + + +# --------------------------------------------------------------------------- +# History +# --------------------------------------------------------------------------- + + +class TestHistory: + """JSONL history format and querying.""" + + def _run_eval(self, tmp_path: Path) -> None: + from protest.api import run_session + + session = EvalSession(model=ModelInfo(name="test-model"), history_dir=tmp_path) + + @session.eval(evaluators=[fake_accuracy]) + def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: + return echo_task(case["inputs"]) + + run_session(session) + + def test_history_file_created(self, tmp_path: Path) -> None: + self._run_eval(tmp_path) + assert (tmp_path / "history.jsonl").exists() + + def test_history_entry_format(self, tmp_path: Path) -> None: + self._run_eval(tmp_path) + lines = (tmp_path / "history.jsonl").read_text().strip().splitlines() + entry = json.loads(lines[0]) + + # Required top-level keys + assert "run_id" in entry + assert "timestamp" in entry + assert "git" in entry + assert "environment" in entry + assert "metadata" in entry + assert "evals" in entry + assert "suites" in entry + + # Evals block + assert entry["evals"] is not None + assert entry["evals"]["model"] == "test-model" + + # Suites with kind + suites = entry["suites"] + assert len(suites) == 1 + suite_name = next(iter(suites)) + suite = suites[suite_name] + assert suite["kind"] == 
"eval" + assert "total_cases" in suite + assert "passed" in suite + assert "cases" in suite + + def test_history_test_run_has_null_evals(self, tmp_path: Path) -> None: + from protest.api import run_session + + session = ProTestSession(history=True, history_dir=tmp_path) + + @session.test() + def test_simple() -> None: + pass + + run_session(session) + + lines = (tmp_path / "history.jsonl").read_text().strip().splitlines() + entry = json.loads(lines[0]) + assert entry["evals"] is None + + def test_history_multiple_runs_append(self, tmp_path: Path) -> None: + self._run_eval(tmp_path) + self._run_eval(tmp_path) + lines = (tmp_path / "history.jsonl").read_text().strip().splitlines() + assert len(lines) == 2 + + def test_history_metadata_included(self, tmp_path: Path) -> None: + from protest.api import run_session + + session = EvalSession( + history_dir=tmp_path, + metadata={"env": "test", "version": "1.0"}, + ) + + @session.eval(evaluators=[fake_accuracy]) + def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: + return echo_task(case["inputs"]) + + run_session(session) + + lines = (tmp_path / "history.jsonl").read_text().strip().splitlines() + entry = json.loads(lines[0]) + assert entry["metadata"]["env"] == "test" + + +# --------------------------------------------------------------------------- +# History: clean-dirty +# --------------------------------------------------------------------------- + + +class TestCleanDirty: + """protest history --clean-dirty behavior.""" + + def test_clean_dirty_removes_current_head_only(self, tmp_path: Path) -> None: + # Entry with current HEAD + dirty + import subprocess + + from protest.history.storage import append_entry, clean_dirty + + try: + current_commit = subprocess.run( + ["git", "rev-parse", "HEAD"], + capture_output=True, + text=True, + timeout=5, + check=True, + ).stdout.strip() + except (FileNotFoundError, subprocess.CalledProcessError): + return # skip if not in a git repo + + path = tmp_path / 
"history.jsonl" + + # Dirty entry on current HEAD -> should be removed + append_entry( + path, {"git": {"commit": current_commit, "dirty": True}, "suites": {}} + ) + # Dirty entry on old commit -> should be preserved + append_entry(path, {"git": {"commit": "old123", "dirty": True}, "suites": {}}) + # Clean entry on current HEAD -> should be preserved + append_entry( + path, {"git": {"commit": current_commit, "dirty": False}, "suites": {}} + ) + + removed = clean_dirty(history_dir=tmp_path) + assert removed == 1 + + lines = path.read_text().strip().splitlines() + assert len(lines) == 2 + + +# --------------------------------------------------------------------------- +# Case hashing +# --------------------------------------------------------------------------- + + +class TestCaseHashing: + """Content hashing for eval integrity.""" + + def test_case_hash_stored_in_history(self, tmp_path: Path) -> None: + """History entries include case_hash and eval_hash per case.""" + from protest.api import run_session + + session = EvalSession(history_dir=tmp_path) + + @session.eval(evaluators=[fake_accuracy]) + def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: + return echo_task(case["inputs"]) + + run_session(session) + + lines = (tmp_path / "history.jsonl").read_text().strip().splitlines() + entry = json.loads(lines[0]) + suites = entry["suites"] + suite = next(iter(suites.values())) + case = next(iter(suite["cases"].values())) + assert "case_hash" in case + assert "eval_hash" in case + assert len(case["case_hash"]) > 0 + assert len(case["eval_hash"]) > 0 + + def test_case_hash_changes_on_input_change(self) -> None: + """Different inputs -> different case_hash.""" + from protest.evals.hashing import compute_case_hash + + h1 = compute_case_hash("hello world", "expected") + h2 = compute_case_hash("hello world modified", "expected") + assert h1 != h2 + + def test_case_hash_stable_for_same_input(self) -> None: + """Same inputs -> same case_hash (deterministic).""" + 
from protest.evals.hashing import compute_case_hash + + h1 = compute_case_hash("hello world", "expected") + h2 = compute_case_hash("hello world", "expected") + assert h1 == h2 + + def test_eval_hash_changes_on_evaluator_change(self) -> None: + """Different evaluators -> different eval_hash.""" + from protest.evals.hashing import compute_eval_hash + + e1 = contains_keywords(keywords=["hello"]) + e2 = contains_keywords(keywords=["hello", "world"]) + h1 = compute_eval_hash([e1]) + h2 = compute_eval_hash([e2]) + assert h1 != h2 + + +# --------------------------------------------------------------------------- +# Built-in evaluators +# --------------------------------------------------------------------------- + + +class TestBuiltinEvaluators: + """All built-in evaluators work correctly through protest-native API.""" + + def _make_ctx(self, output: str, expected: str | None = None) -> EvalContext: + """Minimal EvalContext for evaluator testing.""" + return EvalContext( + name="test", + inputs="", + output=output, + expected_output=expected, + metadata=None, + duration=0.0, + ) + + def test_contains_keywords(self) -> None: + e = contains_keywords(keywords=["hello", "world"]) + result = e(self._make_ctx("Hello World")) + assert result.keyword_recall == 1.0 + assert result.all_keywords_present is True + + def test_contains_expected(self) -> None: + e = contains_expected + assert e(self._make_ctx("Hello World", "world")) is True + assert e(self._make_ctx("Hello", "world")) is False + + def test_does_not_contain(self) -> None: + e = does_not_contain(forbidden=["cat", "dog"]) + assert e(self._make_ctx("Yorkshire")).no_forbidden_words is True + assert e(self._make_ctx("I like cats")).no_forbidden_words is False + + def test_not_empty(self) -> None: + assert not_empty(self._make_ctx("hello")) is True + assert not_empty(self._make_ctx("")) is False + assert not_empty(self._make_ctx(" ")) is False + + def test_max_length(self) -> None: + e = max_length(max_chars=5) + result = 
e(self._make_ctx("hi")) + assert result.within_limit is True + result = e(self._make_ctx("this is too long")) + assert result.within_limit is False + + def test_min_length(self) -> None: + assert min_length(min_chars=3)(self._make_ctx("hello")) is True + assert min_length(min_chars=10)(self._make_ctx("hi")) is False + + def test_matches_regex(self) -> None: + e = matches_regex(pattern=r"\d{3}-\d{4}") + assert e(self._make_ctx("Call 555-1234")) is True + assert e(self._make_ctx("no numbers")) is False + + def test_json_valid(self) -> None: + e = json_valid(required_keys=["name"]) + result = e(self._make_ctx('{"name": "Rex"}')) + assert result.valid_json is True + assert result.has_required_keys is True + result = e(self._make_ctx("not json")) + assert result.valid_json is False + + def test_word_overlap(self) -> None: + e = word_overlap + assert e(self._make_ctx("hello world", "hello world")).overlap == 1.0 + assert e(self._make_ctx("hello there", "hello world")).overlap == 0.5 + assert e(self._make_ctx("foo", "hello world")).overlap == 0.0 + + +# --------------------------------------------------------------------------- +# Scoring v2: bool verdict, tracking-only metrics +# --------------------------------------------------------------------------- + + +class TestScoringV2: + """Scoring v2: evaluators return bool or dataclass.""" + + def test_bool_evaluator_pass(self) -> None: + """Evaluator returning True -> case passes.""" + from protest.plugin import PluginBase + + results: list[Any] = [] + + class Collector(PluginBase): + name = "collector" + + def on_test_pass(self, result: Any) -> None: + results.append(result) + + def on_test_fail(self, result: Any) -> None: + results.append(result) + + single_case = ForEach( + [ + {"inputs": "hello world", "expected": "hello", "name": "c1"}, + ], + ids=lambda c: c["name"], + ) + + session = EvalSession() + session.register_plugin(Collector()) + + @session.eval(evaluators=[not_empty]) + def eval_echo(case: Annotated[dict, 
From(single_case)]) -> str: + return echo_task(case["inputs"]) + + runner = TestRunner(session) + result = runner.run() + + assert result.success is True + assert len(results) == 1 + assert results[0].eval_payload.scores["not_empty"].value is True + + def test_dataclass_without_bool_is_tracking_only(self) -> None: + """Dataclass with only float fields -> tracking-only, always passes.""" + from protest.plugin import PluginBase + + results: list[Any] = [] + + class Collector(PluginBase): + name = "collector" + + def on_test_pass(self, result: Any) -> None: + results.append(result) + + def on_test_fail(self, result: Any) -> None: + results.append(result) + + single_case = ForEach( + [ + {"inputs": "foo", "expected": "bar baz", "name": "c1"}, + ], + ids=lambda c: c["name"], + ) + + session = EvalSession() + session.register_plugin(Collector()) + + @session.eval(evaluators=[word_overlap]) + def eval_echo(case: Annotated[dict, From(single_case)]) -> str: + return echo_task(case["inputs"]) + + runner = TestRunner(session) + result = runner.run() + + # word_overlap returns only float -> tracking-only, always passes + assert result.success is True + + def test_float_return_raises_type_error(self) -> None: + """Evaluator returning naked float -> TypeError (caught as fixture error).""" + from protest.plugin import PluginBase + + results: list[Any] = [] + + class Collector(PluginBase): + name = "collector" + + def on_test_fail(self, result: Any) -> None: + results.append(result) + + @evaluator + def bad_evaluator(ctx: EvalContext) -> float: + return 0.5 + + single_case = ForEach( + [{"inputs": "hello", "expected": "hello", "name": "c1"}], + ids=lambda c: c["name"], + ) + + session = EvalSession() + session.register_plugin(Collector()) + + @session.eval(evaluators=[bad_evaluator]) + def eval_echo(case: Annotated[dict, From(single_case)]) -> str: + return echo_task(case["inputs"]) + + runner = TestRunner(session) + runner.run() + + assert len(results) == 1 + assert 
results[0].is_fixture_error is True + + +# --------------------------------------------------------------------------- +# Results files per run +# --------------------------------------------------------------------------- + + +class TestResultsFiles: + """Per-case markdown files written to .protest/results/_/.""" + + def _run_eval(self, tmp_path: Path) -> Path: + from protest.evals.results_writer import EvalResultsWriter + + results_dir = tmp_path / "results" + session = EvalSession() + writer = EvalResultsWriter(history_dir=tmp_path) + session.register_plugin(writer) + + @session.eval(evaluators=[fake_accuracy]) + def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: + return echo_task(case["inputs"]) + + runner = TestRunner(session) + runner.run() + return results_dir + + def test_results_dir_created(self, tmp_path: Path) -> None: + results_dir = self._run_eval(tmp_path) + assert results_dir.exists() + + def test_one_file_per_case(self, tmp_path: Path) -> None: + results_dir = self._run_eval(tmp_path) + run_dirs = list(results_dir.iterdir()) + assert len(run_dirs) == 1 + case_files = list(run_dirs[0].iterdir()) + assert len(case_files) == 2 # case_pass + case_fail + + def test_case_file_contains_output(self, tmp_path: Path) -> None: + results_dir = self._run_eval(tmp_path) + run_dir = next(results_dir.iterdir()) + pass_file = next(f for f in run_dir.iterdir() if "pass" in f.name) + content = pass_file.read_text() + assert "Echo:" in content # task output + assert "PASS" in content + + def test_case_file_contains_scores(self, tmp_path: Path) -> None: + results_dir = self._run_eval(tmp_path) + run_dir = next(results_dir.iterdir()) + pass_file = next(f for f in run_dir.iterdir() if "pass" in f.name) + content = pass_file.read_text() + assert "accuracy" in content + + def test_case_file_contains_inputs(self, tmp_path: Path) -> None: + results_dir = self._run_eval(tmp_path) + run_dir = next(results_dir.iterdir()) + pass_file = next(f for f in 
run_dir.iterdir() if "pass" in f.name) + content = pass_file.read_text() + assert "hello world" in content # from case inputs + + +# --------------------------------------------------------------------------- +# Multi-dataset history (regression: all suites were merged under one name) +# --------------------------------------------------------------------------- + + +class TestMultiDatasetHistory: + """Multiple @session.eval calls produce distinct suites in history.""" + + def _run_multi(self, tmp_path: Path) -> dict[str, Any]: + from protest.api import run_session + + pipeline_cases = ForEach( + [ + {"inputs": "hello", "expected": "hello", "name": "c1"}, + ], + ids=lambda c: c["name"], + ) + + ingest_cases = ForEach( + [ + {"inputs": "world", "expected": "world", "name": "c2"}, + ], + ids=lambda c: c["name"], + ) + + session = EvalSession(history_dir=tmp_path) + + @session.eval(evaluators=[fake_accuracy]) + def pipeline(case: Annotated[dict, From(pipeline_cases)]) -> str: + return echo_task(case["inputs"]) + + @session.eval(evaluators=[fake_accuracy]) + def ingest(case: Annotated[dict, From(ingest_cases)]) -> str: + return echo_task(case["inputs"]) + + run_session(session) + + history = (tmp_path / "history.jsonl").read_text().splitlines() + return json.loads(history[-1]) + + def test_two_datasets_produce_two_suites_in_history(self, tmp_path: Path) -> None: + entry = self._run_multi(tmp_path) + assert "pipeline" in entry["suites"] + assert "ingest" in entry["suites"] + + def test_each_suite_has_its_own_cases(self, tmp_path: Path) -> None: + entry = self._run_multi(tmp_path) + assert "c1" in entry["suites"]["pipeline"]["cases"] + assert "c2" in entry["suites"]["ingest"]["cases"] + + +# --------------------------------------------------------------------------- +# DI fixture injection dans les taches eval +# --------------------------------------------------------------------------- + + +class TestEvalTaskFixtures: + """@session.eval() peut utiliser des fixtures 
protest via Use().""" + + def test_task_without_fixtures_still_works(self) -> None: + # basic_cases has one match (case_pass) and one mismatch (case_fail) + # fake_accuracy returns matches_expected=False for case_fail -> fail + session = EvalSession() + + @session.eval(evaluators=[fake_accuracy]) + def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: + return echo_task(case["inputs"]) + + runner = TestRunner(session) + result = runner.run() + assert result.success is False # case_fail has matches_expected=False + + def test_task_with_session_fixture_is_injected(self) -> None: + """Une fixture session-scoped est injectee dans task via Use().""" + from protest import Use, fixture + + @fixture() + def prefix_service() -> str: + return "PREFIX" + + single_case = ForEach( + [ + {"inputs": "hello", "expected": "PREFIX:hello", "name": "c1"}, + ], + ids=lambda c: c["name"], + ) + + session = EvalSession() + session.bind(prefix_service) + + @session.eval(evaluators=[fake_accuracy]) + async def eval_prefixed( + case: Annotated[dict, From(single_case)], + svc: Annotated[str, Use(prefix_service)], + ) -> str: + return f"{svc}:{case['inputs']}" + + runner = TestRunner(session) + result = runner.run() + + # fake_accuracy retourne 1.0 (output contient expected) -> passe + assert result.success is True + + def test_session_fixture_resolved_once_for_all_cases(self) -> None: + """Une session fixture ne doit etre appelee qu'une fois meme avec N cas.""" + from protest import Use, fixture + + call_count = 0 + + @fixture() + def expensive_resource() -> str: + nonlocal call_count + call_count += 1 + return "resource" + + multi_cases = ForEach( + [ + {"inputs": "a", "expected": "resource:a", "name": "c1"}, + {"inputs": "b", "expected": "resource:b", "name": "c2"}, + {"inputs": "c", "expected": "resource:c", "name": "c3"}, + ], + ids=lambda c: c["name"], + ) + + session = EvalSession() + session.bind(expensive_resource) + + @session.eval(evaluators=[fake_accuracy]) + async def 
eval_resource( + case: Annotated[dict, From(multi_cases)], + res: Annotated[str, Use(expensive_resource)], + ) -> str: + return f"{res}:{case['inputs']}" + + runner = TestRunner(session) + runner.run() + + assert call_count == 1 # fixture resolue une seule fois diff --git a/tests/evals/test_hashing.py b/tests/evals/test_hashing.py new file mode 100644 index 0000000..bc53e1f --- /dev/null +++ b/tests/evals/test_hashing.py @@ -0,0 +1,72 @@ +"""Tests for protest.evals.hashing — including non-picklable dataclass fields.""" + +from __future__ import annotations + +import dataclasses +import threading + +from protest.evals.hashing import _canonical, compute_eval_hash + +# --------------------------------------------------------------------------- +# _canonical — dataclass handling +# --------------------------------------------------------------------------- + + +@dataclasses.dataclass +class SimpleEvaluator: + threshold: float + name: str = "simple" + + +@dataclasses.dataclass +class NestedEvaluator: + inner: SimpleEvaluator + weight: float = 1.0 + + +@dataclasses.dataclass +class LockHoldingEvaluator: + """Simulates evaluators like LLMJudge that hold non-picklable resources.""" + + name: str + _lock: threading.Lock = dataclasses.field(default_factory=threading.Lock) + + +class TestCanonicalDataclass: + def test_simple_dataclass_is_serialized(self) -> None: + ev = SimpleEvaluator(threshold=0.8) + result = _canonical(ev) + assert result == {"threshold": 0.8, "name": "simple"} + + def test_nested_dataclass_is_serialized_recursively(self) -> None: + ev = NestedEvaluator(inner=SimpleEvaluator(threshold=0.5), weight=2.0) + result = _canonical(ev) + assert result == {"inner": {"threshold": 0.5, "name": "simple"}, "weight": 2.0} + + def test_dataclass_with_lock_does_not_crash(self) -> None: + """Regression: dataclasses.asdict() deepcopy fails on threading.Lock.""" + ev = LockHoldingEvaluator(name="llm_judge") + # Must not raise — lock falls back to repr() + result = 
_canonical(ev) + assert result["name"] == "llm_judge" + assert "_lock" in result + + +class TestComputeEvalHash: + def test_identical_evaluators_produce_same_hash(self) -> None: + ev = SimpleEvaluator(threshold=0.8) + h1 = compute_eval_hash([ev]) + h2 = compute_eval_hash([ev]) + assert h1 == h2 + + def test_different_thresholds_produce_different_hashes(self) -> None: + ev_a = SimpleEvaluator(threshold=0.8) + ev_b = SimpleEvaluator(threshold=0.9) + assert compute_eval_hash([ev_a]) != compute_eval_hash([ev_b]) + + def test_evaluator_with_lock_does_not_crash(self) -> None: + """Regression for non-picklable evaluator fields.""" + ev = LockHoldingEvaluator(name="llm_judge") + # Should not raise TypeError about cannot pickle '_thread.lock' + hash_val = compute_eval_hash([ev]) + assert len(hash_val) == 12 From 29204bc831f738938c65409591b5d013b795a0e3 Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Sun, 29 Mar 2026 20:30:00 +0200 Subject: [PATCH 05/60] chore: entity exports, pyproject config --- docs/evals.md | 97 +++++++--- protest/entities/__init__.py | 4 + protest/entities/core.py | 2 + protest/entities/suite_path.py | 5 + pyproject.toml | 20 ++ uv.lock | 327 ++++++++++++++++++++++++++++++++- 6 files changed, 425 insertions(+), 30 deletions(-) diff --git a/docs/evals.md b/docs/evals.md index 11895aa..1ff3235 100644 --- a/docs/evals.md +++ b/docs/evals.md @@ -1,6 +1,6 @@ # Evals -Evaluate LLM outputs with scored metrics, thresholds, and historical tracking. +Evaluate LLM outputs with scored metrics and historical tracking. ## What is an Eval? 
@@ -15,7 +15,7 @@ ProTest evals use the same infrastructure as tests: fixtures, DI, parallelism, t from typing import Annotated from protest import ForEach, From -from protest.evals import EvalCase, EvalSession, evaluator +from protest.evals import EvalCase, EvalSession, ModelInfo, evaluator from protest.evals.evaluators import contains_keywords cases = ForEach([ @@ -23,7 +23,7 @@ cases = ForEach([ EvalCase(inputs="What is 2+2?", expected="4", name="math"), ]) -session = EvalSession() +session = EvalSession(model=ModelInfo(name="gpt-4o-mini")) @session.eval(evaluators=[contains_keywords(keywords=["Marie"])]) async def chatbot(case: Annotated[EvalCase, From(cases)]) -> str: @@ -41,7 +41,7 @@ protest eval evals.session:session 1. Your function receives case data via `ForEach`/`From` (same as parameterized tests) 2. It returns the output (string, object, anything) 3. ProTest passes the output to evaluators → scores -4. Scores determine pass/fail via thresholds +4. Bool verdicts determine pass/fail 5. Aggregated stats appear in the terminal The rest of the pipeline — fixtures, DI, parallelism, reporters — works identically to tests. @@ -87,15 +87,44 @@ An evaluator is a function decorated with `@evaluator` that receives an `EvalCon ### Return Types -Evaluators return `bool` (simple verdict) or a `dataclass` (structured result). The framework reads fields by type: +Evaluators return `bool` (simple verdict) or a `dataclass` (structured result). 
In dataclasses, annotate fields to tell the framework what each one is: -| Field Type | Role | +```python +from typing import Annotated +from protest.evals import Metric, Verdict, Reason +``` + +| Annotation | Role | |------------|------| -| `bool` | Verdict — pass/fail (`all(bool_fields)`) | -| `float` | Metric — aggregated in stats (mean/p50/p95) | -| `str` | Reason — displayed on failure, stored in history | +| `Annotated[bool, Verdict]` | Verdict — pass/fail (`all(verdicts)`) | +| `Annotated[float, Metric]` | Metric — aggregated in stats (mean/p50/p95) | +| `Annotated[int, Metric]` | Metric — converted to float | +| `Annotated[str, Reason]` | Reason — displayed on failure, stored in history | + +Unannotated fields are ignored by the runner — free metadata. + +Returning `float`, `dict`, or any other non-dataclass/non-bool type raises `TypeError`. + +### Tracking-Only Evaluators + +A dataclass with `Metric` fields but no `Verdict` is tracking-only. The case always passes for this evaluator — it measures without gating. + +```python +@dataclass +class OverlapMetrics: + overlap: Annotated[float, Metric] + +@evaluator +def word_overlap(ctx: EvalContext) -> OverlapMetrics: + ... +``` + +In the terminal, tracking evaluators show with `·` instead of `✓`/`✗`: -Returning `float`, `dict`, or any other type raises `TypeError`. 
+``` +✓ chatbot[lookup] (1.2s) keyword_recall=0.95 all_present=✓ +· chatbot[lookup] overlap=0.80 +``` ### Simple Evaluator @@ -109,12 +138,14 @@ def not_empty(ctx: EvalContext) -> bool: ```python from dataclasses import dataclass +from typing import Annotated +from protest.evals import Metric, Verdict, Reason @dataclass class KeywordScores: - keyword_recall: float # metric → stats - all_present: bool # verdict → pass/fail - detail: str = "" # reason → shown on failure + keyword_recall: Annotated[float, Metric] + all_present: Annotated[bool, Verdict] + detail: Annotated[str, Reason] = "" @evaluator def keyword_check(ctx: EvalContext, keywords: list[str], min_recall: float = 0.5) -> KeywordScores: @@ -134,9 +165,9 @@ The threshold (`min_recall`) is a parameter of the evaluator, not a framework co ```python @dataclass class JudgeResult: - accuracy: float - accurate_enough: bool - reason: str = "" + accuracy: Annotated[float, Metric] + accurate_enough: Annotated[bool, Verdict] + reason: Annotated[str, Reason] = "" @evaluator async def llm_judge(ctx: EvalContext, rubric: str = "", min_score: float = 0.7) -> JudgeResult: @@ -223,9 +254,13 @@ session = EvalSession(model=ModelInfo(name="qwen-2.5")) ## Evaluator Errors -If an evaluator raises an exception (e.g. LLM judge timeout), the case is marked as **error** (not fail). The stack trace appears in the output. Scores from other evaluators that ran before the error are lost. +If an evaluator raises an exception (e.g. LLM judge timeout), the case is marked as **error** (not fail). The stack trace appears in the output. + +> **Tip:** For non-deterministic evaluators (LLM judges), catch exceptions in the evaluator and return a verdict indicating failure rather than letting them propagate. + +## Name Collisions -> **Tip:** For non-deterministic evaluators (LLM judges), catch exceptions in the evaluator and return a score indicating failure rather than letting them propagate. 
+If two evaluators return dataclasses with the same field name (e.g. both have `accuracy`), the runner prefixes with the evaluator name when it detects a conflict: `llm_judge.accuracy`, `fact_check.accuracy`. ## Multi-Model Sessions @@ -290,16 +325,20 @@ Flags are independent and combinable: `-v --show-output --show-logs`. ### Default ``` - ✓ chatbot[lookup] (3.39s) facts_score=1.00 facts_ok=✓ - ✗ chatbot[causal]: facts_ok=False, LLMJudge=False - - Eval: chatbot (26 cases) -┏━━━━━━━━━━━━━┳━━━━━━┳━━━━━━┳━━━━━━┳━━━━━━┓ -┃ Score ┃ mean ┃ p50 ┃ p5 ┃ p95 ┃ -┡━━━━━━━━━━━━━╇━━━━━━╇━━━━━━╇━━━━━━╇━━━━━━┩ -│ facts_score │ 0.37 │ 0.00 │ 0.00 │ 1.00 │ -└─────────────┴──────┴──────┴──────┴──────┘ - Passed: 14/26 (53.8%) + ✓ chatbot[lookup] (1.2s) keyword_recall=1.00 all_keywords_present=✓ + ✗ chatbot[math]: all_keywords_present=False + │ inputs: What is 2+2? + │ output: The answer is 4. + │ expected: 4 + │ detail: found 0/1 + + Eval: chatbot (2 cases) +┏━━━━━━━━━━━━━━━━━┳━━━━━━┳━━━━━━┳━━━━━━┳━━━━━━┓ +┃ Score ┃ mean ┃ p50 ┃ p5 ┃ p95 ┃ +┡━━━━━━━━━━━━━━━━━╇━━━━━━╇━━━━━━╇━━━━━━╇━━━━━━┩ +│ keyword_recall │ 0.50 │ 0.50 │ 0.00 │ 1.00 │ +└─────────────────┴──────┴──────┴──────┴──────┘ + Passed: 1/2 (50.0%) Results: .protest/results/chatbot_20260329_091422 ``` @@ -334,7 +373,7 @@ protest history --evals --compare Each case in history carries two hashes: - **`case_hash`** — hash of inputs + expected output. Changes when the test data changes. -- **`eval_hash`** — hash of evaluators + thresholds. Changes when the scoring criteria change. +- **`eval_hash`** — hash of evaluators. Changes when the scoring criteria change. `protest history --compare` uses these hashes to detect modified cases vs regressions. If a case's `eval_hash` changed between runs, it's reported as "scoring modified" rather than a real regression. 
diff --git a/protest/entities/__init__.py b/protest/entities/__init__.py index ec91eb9..30bd04e 100644 --- a/protest/entities/__init__.py +++ b/protest/entities/__init__.py @@ -10,6 +10,8 @@ format_fixture_scope, ) from protest.entities.events import ( + EvalPayload, + EvalScoreEntry, FixtureInfo, HandlerInfo, RunResult, @@ -31,6 +33,8 @@ from protest.entities.xfail import Xfail, normalize_xfail __all__ = [ + "EvalPayload", + "EvalScoreEntry", "Fixture", "FixtureCallable", "FixtureInfo", diff --git a/protest/entities/core.py b/protest/entities/core.py index 465c5d3..f5efa22 100644 --- a/protest/entities/core.py +++ b/protest/entities/core.py @@ -49,6 +49,7 @@ class TestRegistration: xfail: Xfail | None = None timeout: float | None = None retry: Retry | None = None + is_eval: bool = False @dataclass(frozen=True, slots=True) @@ -111,6 +112,7 @@ class TestItem: xfail: Xfail | None = None timeout: float | None = None retry: Retry | None = None + is_eval: bool = False @property def test_name(self) -> str: diff --git a/protest/entities/suite_path.py b/protest/entities/suite_path.py index 38c78a2..4b7223e 100644 --- a/protest/entities/suite_path.py +++ b/protest/entities/suite_path.py @@ -58,6 +58,11 @@ def lower(self) -> str: """Return lowercase string representation for case-insensitive comparison.""" return self._path.lower() + @property + def root_name(self) -> str: + """Return the top-level suite name: 'A::B::C' -> 'A'.""" + return self.parts[0] if self.parts else "" + def __str__(self) -> str: return self._path diff --git a/pyproject.toml b/pyproject.toml index 090118c..6b25e2e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -49,6 +49,9 @@ rich = [ web = [ "websockets>=12.0", ] +evals = [ + "pydantic-evals>=0.1", +] [tool.ruff] @@ -100,6 +103,23 @@ ignore = [ "PLC0415", # lazy import for optional rich dependency "PLR0913", # many args is deliberate API design ] +"protest/core/execution/test_executor.py" = [ + "PLR0915", # _run_test is inherently complex (retry 
loop + eval capture) +] +"protest/history/**" = [ + "PLC0415", # lazy imports + "S603", # subprocess git calls are safe + "PLR0913", # load_history has many filter params by design +] +"protest/cli/history.py" = [ + "T201", # print for CLI output + "PLC0415", # lazy imports +] +"protest/evals/**" = [ + "T201", # print for eval reporting + "PLC0415", # lazy imports for optional pydantic-evals dependency + "PLR0913", # adapter functions have many params by design +] "protest/reporting/ascii.py" = [ "T201", # print is the purpose of this module ] diff --git a/uv.lock b/uv.lock index d7c8a6d..aa650bb 100644 --- a/uv.lock +++ b/uv.lock @@ -2,6 +2,29 @@ version = 1 revision = 3 requires-python = ">=3.10" +[[package]] +name = "annotated-types" +version = "0.7.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ee/67/531ea369ba64dcff5ec9c3402f9f51bf748cec26dde048a2f973a4eea7f5/annotated_types-0.7.0.tar.gz", hash = "sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89", size = 16081, upload-time = "2024-05-20T21:33:25.928Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643, upload-time = "2024-05-20T21:33:24.1Z" }, +] + +[[package]] +name = "anyio" +version = "4.13.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "exceptiongroup", marker = "python_full_version < '3.11'" }, + { name = "idna" }, + { name = "typing-extensions", marker = "python_full_version < '3.13'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/19/14/2c5dd9f512b66549ae92767a9c7b330ae88e1932ca57876909410251fe13/anyio-4.13.0.tar.gz", hash = "sha256:334b70e641fd2221c1505b3890c69882fe4a2df910cba14d97019b90b24439dc", size = 231622, upload-time = "2026-03-24T12:59:09.671Z" 
} +wheels = [ + { url = "https://files.pythonhosted.org/packages/da/42/e921fccf5015463e32a3cf6ee7f980a6ed0f395ceeaa45060b61d86486c2/anyio-4.13.0-py3-none-any.whl", hash = "sha256:08b310f9e24a9594186fd75b4f73f4a4152069e3853f1ed8bfbf58369f4ad708", size = 114353, upload-time = "2026-03-24T12:59:08.246Z" }, +] + [[package]] name = "attrs" version = "25.4.0" @@ -305,6 +328,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/76/91/7216b27286936c16f5b4d0c530087e4a54eead683e6b0b73dd0c64844af6/filelock-3.20.0-py3-none-any.whl", hash = "sha256:339b4732ffda5cd79b13f4e2711a31b0365ce445d95d243bb996273d072546a2", size = 16054, upload-time = "2025-10-08T18:03:48.35Z" }, ] +[[package]] +name = "genai-prices" +version = "0.0.56" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "httpx" }, + { name = "pydantic" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/44/6b/94b3018a672c7775edfb485f0fed8f6068fba75e49b067e8a1ac5eb96764/genai_prices-0.0.56.tar.gz", hash = "sha256:ac24b16a84d0ab97539bfa48dfa4649689de8e3ce71c12ebacef29efb1998045", size = 65872, upload-time = "2026-03-20T20:33:00.732Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a3/f6/8ef7e4c286deb2709d11ca96a5237caae3ef4876ab3c48095856cfd2df30/genai_prices-0.0.56-py3-none-any.whl", hash = "sha256:dbe86be8f3f556bed1b72209ed36851fec8b01793b3b220f42921a4e7da945f6", size = 68966, upload-time = "2026-03-20T20:33:02.555Z" }, +] + [[package]] name = "ghp-import" version = "2.1.0" @@ -317,6 +353,52 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f7/ec/67fbef5d497f86283db54c22eec6f6140243aae73265799baaaa19cd17fb/ghp_import-2.1.0-py3-none-any.whl", hash = "sha256:8337dd7b50877f163d4c0289bc1f1c7f127550241988d568c1db512c4324a619", size = 11034, upload-time = "2022-05-02T15:47:14.552Z" }, ] +[[package]] +name = "griffelib" +version = "2.0.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/9d/82/74f4a3310cdabfbb10da554c3a672847f1ed33c6f61dd472681ce7f1fe67/griffelib-2.0.2.tar.gz", hash = "sha256:3cf20b3bc470e83763ffbf236e0076b1211bac1bc67de13daf494640f2de707e", size = 166461, upload-time = "2026-03-27T11:34:51.091Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/11/8c/c9138d881c79aa0ea9ed83cbd58d5ca75624378b38cee225dcf5c42cc91f/griffelib-2.0.2-py3-none-any.whl", hash = "sha256:925c857658fb1ba40c0772c37acbc2ab650bd794d9c1b9726922e36ea4117ea1", size = 142357, upload-time = "2026-03-27T11:34:46.275Z" }, +] + +[[package]] +name = "h11" +version = "0.16.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/01/ee/02a2c011bdab74c6fb3c75474d40b3052059d95df7e73351460c8588d963/h11-0.16.0.tar.gz", hash = "sha256:4e35b956cf45792e4caa5885e69fba00bdbc6ffafbfa020300e549b208ee5ff1", size = 101250, upload-time = "2025-04-24T03:35:25.427Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/04/4b/29cac41a4d98d144bf5f6d33995617b185d14b22401f75ca86f384e87ff1/h11-0.16.0-py3-none-any.whl", hash = "sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86", size = 37515, upload-time = "2025-04-24T03:35:24.344Z" }, +] + +[[package]] +name = "httpcore" +version = "1.0.9" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "certifi" }, + { name = "h11" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/06/94/82699a10bca87a5556c9c59b5963f2d039dbd239f25bc2a63907a05a14cb/httpcore-1.0.9.tar.gz", hash = "sha256:6e34463af53fd2ab5d807f399a9b45ea31c3dfa2276f15a2c3f00afff6e176e8", size = 85484, upload-time = "2025-04-24T22:06:22.219Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7e/f5/f66802a942d491edb555dd61e3a9961140fd64c90bce1eafd741609d334d/httpcore-1.0.9-py3-none-any.whl", hash = "sha256:2d400746a40668fc9dec9810239072b40b4484b640a8c38fd654a024c7a1bf55", size = 78784, upload-time 
= "2025-04-24T22:06:20.566Z" }, +] + +[[package]] +name = "httpx" +version = "0.28.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "anyio" }, + { name = "certifi" }, + { name = "httpcore" }, + { name = "idna" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b1/df/48c586a5fe32a0f01324ee087459e112ebb7224f646c0b5023f5e79e9956/httpx-0.28.1.tar.gz", hash = "sha256:75e98c5f16b0f35b567856f597f06ff2270a374470a5c2392242528e3e3e42fc", size = 141406, upload-time = "2024-12-06T15:37:23.222Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload-time = "2024-12-06T15:37:21.509Z" }, +] + [[package]] name = "identify" version = "2.6.15" @@ -335,6 +417,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/0e/61/66938bbb5fc52dbdf84594873d5b51fb1f7c7794e9c0f5bd885f30bc507b/idna-3.11-py3-none-any.whl", hash = "sha256:771a87f49d9defaf64091e6e6fe9c18d4833f140bd19464795bc32d966ca37ea", size = 71008, upload-time = "2025-10-12T14:55:18.883Z" }, ] +[[package]] +name = "importlib-metadata" +version = "8.7.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "zipp" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/f3/49/3b30cad09e7771a4982d9975a8cbf64f00d4a1ececb53297f1d9a7be1b10/importlib_metadata-8.7.1.tar.gz", hash = "sha256:49fef1ae6440c182052f407c8d34a68f72efc36db9ca90dc0113398f2fdde8bb", size = 57107, upload-time = "2025-12-21T10:00:19.278Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fa/5e/f8e9a1d23b9c20a551a8a02ea3637b4642e22c2626e3a13a9a29cdea99eb/importlib_metadata-8.7.1-py3-none-any.whl", hash = "sha256:5a1f80bf1daa489495071efbb095d75a634cf28a8bc299581244063b53176151", size = 27865, upload-time = "2025-12-21T10:00:18.329Z" }, +] + [[package]] name 
= "iniconfig" version = "2.1.0" @@ -383,6 +477,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/41/45/1a4ed80516f02155c51f51e8cedb3c1902296743db0bbc66608a0db2814f/jsonschema_specifications-2025.9.1-py3-none-any.whl", hash = "sha256:98802fee3a11ee76ecaca44429fda8a41bff98b00a0f2838151b113f210cc6fe", size = 18437, upload-time = "2025-09-08T01:34:57.871Z" }, ] +[[package]] +name = "logfire-api" +version = "4.31.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/08/a2/8d5a3c1c282d5f2bd9f5e9ddd5288d1414a53301ce389af9016b6d82bd50/logfire_api-4.31.0.tar.gz", hash = "sha256:fc4b01257ebd4ce297ad374ed201eb1a9213b999f6ae6df45cfca5bd0ef378f8", size = 77838, upload-time = "2026-03-27T19:00:47.545Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/26/27/9372b7492b3e146908d520f8599909311cd930175801ad219171fafc6f3e/logfire_api-4.31.0-py3-none-any.whl", hash = "sha256:3c1f502fd4eb8ef0996427a5cf275fd8f327f38600650a1f53071a8171c812db", size = 123402, upload-time = "2026-03-27T19:00:44.952Z" }, +] + [[package]] name = "markdown" version = "3.10" @@ -585,6 +688,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d2/1d/1b658dbd2b9fa9c4c9f32accbfc0205d532c8c6194dc0f2a4c0428e7128a/nodeenv-1.9.1-py2.py3-none-any.whl", hash = "sha256:ba11c9782d29c27c70ffbdda2d7415098754709be8a7056d79a737cd901155c9", size = 22314, upload-time = "2024-06-04T18:44:08.352Z" }, ] +[[package]] +name = "opentelemetry-api" +version = "1.40.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "importlib-metadata" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/2c/1d/4049a9e8698361cc1a1aa03a6c59e4fa4c71e0c0f94a30f988a6876a2ae6/opentelemetry_api-1.40.0.tar.gz", hash = "sha256:159be641c0b04d11e9ecd576906462773eb97ae1b657730f0ecf64d32071569f", size = 70851, upload-time = "2026-03-04T14:17:21.555Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/5f/bf/93795954016c522008da367da292adceed71cca6ee1717e1d64c83089099/opentelemetry_api-1.40.0-py3-none-any.whl", hash = "sha256:82dd69331ae74b06f6a874704be0cfaa49a1650e1537d4a813b86ecef7d0ecf9", size = 68676, upload-time = "2026-03-04T14:17:01.24Z" }, +] + [[package]] name = "packaging" version = "24.2" @@ -655,6 +771,9 @@ dependencies = [ ] [package.optional-dependencies] +evals = [ + { name = "pydantic-evals" }, +] rich = [ { name = "rich" }, ] @@ -681,11 +800,12 @@ docs = [ [package.metadata] requires-dist = [ + { name = "pydantic-evals", marker = "extra == 'evals'", specifier = ">=0.1" }, { name = "rich", marker = "extra == 'rich'", specifier = ">=13.0" }, { name = "typing-extensions", specifier = ">=4.15.0" }, { name = "websockets", marker = "extra == 'web'", specifier = ">=12.0" }, ] -provides-extras = ["rich", "web"] +provides-extras = ["rich", "web", "evals"] [package.metadata.requires-dev] dev = [ @@ -704,6 +824,190 @@ docs = [ { name = "mkdocs-material", specifier = ">=9.7.0" }, ] +[[package]] +name = "pydantic" +version = "2.12.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "annotated-types" }, + { name = "pydantic-core" }, + { name = "typing-extensions" }, + { name = "typing-inspection" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/69/44/36f1a6e523abc58ae5f928898e4aca2e0ea509b5aa6f6f392a5d882be928/pydantic-2.12.5.tar.gz", hash = "sha256:4d351024c75c0f085a9febbb665ce8c0c6ec5d30e903bdb6394b7ede26aebb49", size = 821591, upload-time = "2025-11-26T15:11:46.471Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/5a/87/b70ad306ebb6f9b585f114d0ac2137d792b48be34d732d60e597c2f8465a/pydantic-2.12.5-py3-none-any.whl", hash = "sha256:e561593fccf61e8a20fc46dfc2dfe075b8be7d0188df33f221ad1f0139180f9d", size = 463580, upload-time = "2025-11-26T15:11:44.605Z" }, +] + +[[package]] +name = "pydantic-ai-slim" +version = "1.73.0" +source = { registry = 
"https://pypi.org/simple" } +dependencies = [ + { name = "exceptiongroup", marker = "python_full_version < '3.11'" }, + { name = "genai-prices" }, + { name = "griffelib" }, + { name = "httpx" }, + { name = "opentelemetry-api" }, + { name = "pydantic" }, + { name = "pydantic-graph" }, + { name = "typing-inspection" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/6a/1b/a5e18c7c721a3cfce5b17f86cb99e4142fcb70f38ea6d2b8963c2df445e1/pydantic_ai_slim-1.73.0.tar.gz", hash = "sha256:758d5bedb4b4f484c433672639bfc87af216a38453b1539ae10928a9ca62ff62", size = 497208, upload-time = "2026-03-27T03:49:49.459Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/04/3b/6aa1874cd0ccbc83c17c8eb308834bf004c8d4344c27cd8048851d4b284d/pydantic_ai_slim-1.73.0-py3-none-any.whl", hash = "sha256:f7176ce6c78539e1070d7e22549186862c2f6e6ea8b05b3aaad8a1942ba1ff4f", size = 638701, upload-time = "2026-03-27T03:49:42.804Z" }, +] + +[[package]] +name = "pydantic-core" +version = "2.41.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/71/70/23b021c950c2addd24ec408e9ab05d59b035b39d97cdc1130e1bce647bb6/pydantic_core-2.41.5.tar.gz", hash = "sha256:08daa51ea16ad373ffd5e7606252cc32f07bc72b28284b6bc9c6df804816476e", size = 460952, upload-time = "2025-11-04T13:43:49.098Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c6/90/32c9941e728d564b411d574d8ee0cf09b12ec978cb22b294995bae5549a5/pydantic_core-2.41.5-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:77b63866ca88d804225eaa4af3e664c5faf3568cea95360d21f4725ab6e07146", size = 2107298, upload-time = "2025-11-04T13:39:04.116Z" }, + { url = "https://files.pythonhosted.org/packages/fb/a8/61c96a77fe28993d9a6fb0f4127e05430a267b235a124545d79fea46dd65/pydantic_core-2.41.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:dfa8a0c812ac681395907e71e1274819dec685fec28273a28905df579ef137e2", size = 
1901475, upload-time = "2025-11-04T13:39:06.055Z" }, + { url = "https://files.pythonhosted.org/packages/5d/b6/338abf60225acc18cdc08b4faef592d0310923d19a87fba1faf05af5346e/pydantic_core-2.41.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5921a4d3ca3aee735d9fd163808f5e8dd6c6972101e4adbda9a4667908849b97", size = 1918815, upload-time = "2025-11-04T13:39:10.41Z" }, + { url = "https://files.pythonhosted.org/packages/d1/1c/2ed0433e682983d8e8cba9c8d8ef274d4791ec6a6f24c58935b90e780e0a/pydantic_core-2.41.5-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e25c479382d26a2a41b7ebea1043564a937db462816ea07afa8a44c0866d52f9", size = 2065567, upload-time = "2025-11-04T13:39:12.244Z" }, + { url = "https://files.pythonhosted.org/packages/b3/24/cf84974ee7d6eae06b9e63289b7b8f6549d416b5c199ca2d7ce13bbcf619/pydantic_core-2.41.5-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f547144f2966e1e16ae626d8ce72b4cfa0caedc7fa28052001c94fb2fcaa1c52", size = 2230442, upload-time = "2025-11-04T13:39:13.962Z" }, + { url = "https://files.pythonhosted.org/packages/fd/21/4e287865504b3edc0136c89c9c09431be326168b1eb7841911cbc877a995/pydantic_core-2.41.5-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6f52298fbd394f9ed112d56f3d11aabd0d5bd27beb3084cc3d8ad069483b8941", size = 2350956, upload-time = "2025-11-04T13:39:15.889Z" }, + { url = "https://files.pythonhosted.org/packages/a8/76/7727ef2ffa4b62fcab916686a68a0426b9b790139720e1934e8ba797e238/pydantic_core-2.41.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:100baa204bb412b74fe285fb0f3a385256dad1d1879f0a5cb1499ed2e83d132a", size = 2068253, upload-time = "2025-11-04T13:39:17.403Z" }, + { url = "https://files.pythonhosted.org/packages/d5/8c/a4abfc79604bcb4c748e18975c44f94f756f08fb04218d5cb87eb0d3a63e/pydantic_core-2.41.5-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = 
"sha256:05a2c8852530ad2812cb7914dc61a1125dc4e06252ee98e5638a12da6cc6fb6c", size = 2177050, upload-time = "2025-11-04T13:39:19.351Z" }, + { url = "https://files.pythonhosted.org/packages/67/b1/de2e9a9a79b480f9cb0b6e8b6ba4c50b18d4e89852426364c66aa82bb7b3/pydantic_core-2.41.5-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:29452c56df2ed968d18d7e21f4ab0ac55e71dc59524872f6fc57dcf4a3249ed2", size = 2147178, upload-time = "2025-11-04T13:39:21Z" }, + { url = "https://files.pythonhosted.org/packages/16/c1/dfb33f837a47b20417500efaa0378adc6635b3c79e8369ff7a03c494b4ac/pydantic_core-2.41.5-cp310-cp310-musllinux_1_1_armv7l.whl", hash = "sha256:d5160812ea7a8a2ffbe233d8da666880cad0cbaf5d4de74ae15c313213d62556", size = 2341833, upload-time = "2025-11-04T13:39:22.606Z" }, + { url = "https://files.pythonhosted.org/packages/47/36/00f398642a0f4b815a9a558c4f1dca1b4020a7d49562807d7bc9ff279a6c/pydantic_core-2.41.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:df3959765b553b9440adfd3c795617c352154e497a4eaf3752555cfb5da8fc49", size = 2321156, upload-time = "2025-11-04T13:39:25.843Z" }, + { url = "https://files.pythonhosted.org/packages/7e/70/cad3acd89fde2010807354d978725ae111ddf6d0ea46d1ea1775b5c1bd0c/pydantic_core-2.41.5-cp310-cp310-win32.whl", hash = "sha256:1f8d33a7f4d5a7889e60dc39856d76d09333d8a6ed0f5f1190635cbec70ec4ba", size = 1989378, upload-time = "2025-11-04T13:39:27.92Z" }, + { url = "https://files.pythonhosted.org/packages/76/92/d338652464c6c367e5608e4488201702cd1cbb0f33f7b6a85a60fe5f3720/pydantic_core-2.41.5-cp310-cp310-win_amd64.whl", hash = "sha256:62de39db01b8d593e45871af2af9e497295db8d73b085f6bfd0b18c83c70a8f9", size = 2013622, upload-time = "2025-11-04T13:39:29.848Z" }, + { url = "https://files.pythonhosted.org/packages/e8/72/74a989dd9f2084b3d9530b0915fdda64ac48831c30dbf7c72a41a5232db8/pydantic_core-2.41.5-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:a3a52f6156e73e7ccb0f8cced536adccb7042be67cb45f9562e12b319c119da6", size = 2105873, upload-time = 
"2025-11-04T13:39:31.373Z" }, + { url = "https://files.pythonhosted.org/packages/12/44/37e403fd9455708b3b942949e1d7febc02167662bf1a7da5b78ee1ea2842/pydantic_core-2.41.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7f3bf998340c6d4b0c9a2f02d6a400e51f123b59565d74dc60d252ce888c260b", size = 1899826, upload-time = "2025-11-04T13:39:32.897Z" }, + { url = "https://files.pythonhosted.org/packages/33/7f/1d5cab3ccf44c1935a359d51a8a2a9e1a654b744b5e7f80d41b88d501eec/pydantic_core-2.41.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:378bec5c66998815d224c9ca994f1e14c0c21cb95d2f52b6021cc0b2a58f2a5a", size = 1917869, upload-time = "2025-11-04T13:39:34.469Z" }, + { url = "https://files.pythonhosted.org/packages/6e/6a/30d94a9674a7fe4f4744052ed6c5e083424510be1e93da5bc47569d11810/pydantic_core-2.41.5-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e7b576130c69225432866fe2f4a469a85a54ade141d96fd396dffcf607b558f8", size = 2063890, upload-time = "2025-11-04T13:39:36.053Z" }, + { url = "https://files.pythonhosted.org/packages/50/be/76e5d46203fcb2750e542f32e6c371ffa9b8ad17364cf94bb0818dbfb50c/pydantic_core-2.41.5-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6cb58b9c66f7e4179a2d5e0f849c48eff5c1fca560994d6eb6543abf955a149e", size = 2229740, upload-time = "2025-11-04T13:39:37.753Z" }, + { url = "https://files.pythonhosted.org/packages/d3/ee/fed784df0144793489f87db310a6bbf8118d7b630ed07aa180d6067e653a/pydantic_core-2.41.5-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:88942d3a3dff3afc8288c21e565e476fc278902ae4d6d134f1eeda118cc830b1", size = 2350021, upload-time = "2025-11-04T13:39:40.94Z" }, + { url = "https://files.pythonhosted.org/packages/c8/be/8fed28dd0a180dca19e72c233cbf58efa36df055e5b9d90d64fd1740b828/pydantic_core-2.41.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f31d95a179f8d64d90f6831d71fa93290893a33148d890ba15de25642c5d075b", 
size = 2066378, upload-time = "2025-11-04T13:39:42.523Z" }, + { url = "https://files.pythonhosted.org/packages/b0/3b/698cf8ae1d536a010e05121b4958b1257f0b5522085e335360e53a6b1c8b/pydantic_core-2.41.5-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:c1df3d34aced70add6f867a8cf413e299177e0c22660cc767218373d0779487b", size = 2175761, upload-time = "2025-11-04T13:39:44.553Z" }, + { url = "https://files.pythonhosted.org/packages/b8/ba/15d537423939553116dea94ce02f9c31be0fa9d0b806d427e0308ec17145/pydantic_core-2.41.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:4009935984bd36bd2c774e13f9a09563ce8de4abaa7226f5108262fa3e637284", size = 2146303, upload-time = "2025-11-04T13:39:46.238Z" }, + { url = "https://files.pythonhosted.org/packages/58/7f/0de669bf37d206723795f9c90c82966726a2ab06c336deba4735b55af431/pydantic_core-2.41.5-cp311-cp311-musllinux_1_1_armv7l.whl", hash = "sha256:34a64bc3441dc1213096a20fe27e8e128bd3ff89921706e83c0b1ac971276594", size = 2340355, upload-time = "2025-11-04T13:39:48.002Z" }, + { url = "https://files.pythonhosted.org/packages/e5/de/e7482c435b83d7e3c3ee5ee4451f6e8973cff0eb6007d2872ce6383f6398/pydantic_core-2.41.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:c9e19dd6e28fdcaa5a1de679aec4141f691023916427ef9bae8584f9c2fb3b0e", size = 2319875, upload-time = "2025-11-04T13:39:49.705Z" }, + { url = "https://files.pythonhosted.org/packages/fe/e6/8c9e81bb6dd7560e33b9053351c29f30c8194b72f2d6932888581f503482/pydantic_core-2.41.5-cp311-cp311-win32.whl", hash = "sha256:2c010c6ded393148374c0f6f0bf89d206bf3217f201faa0635dcd56bd1520f6b", size = 1987549, upload-time = "2025-11-04T13:39:51.842Z" }, + { url = "https://files.pythonhosted.org/packages/11/66/f14d1d978ea94d1bc21fc98fcf570f9542fe55bfcc40269d4e1a21c19bf7/pydantic_core-2.41.5-cp311-cp311-win_amd64.whl", hash = "sha256:76ee27c6e9c7f16f47db7a94157112a2f3a00e958bc626e2f4ee8bec5c328fbe", size = 2011305, upload-time = "2025-11-04T13:39:53.485Z" }, + { url = 
"https://files.pythonhosted.org/packages/56/d8/0e271434e8efd03186c5386671328154ee349ff0354d83c74f5caaf096ed/pydantic_core-2.41.5-cp311-cp311-win_arm64.whl", hash = "sha256:4bc36bbc0b7584de96561184ad7f012478987882ebf9f9c389b23f432ea3d90f", size = 1972902, upload-time = "2025-11-04T13:39:56.488Z" }, + { url = "https://files.pythonhosted.org/packages/5f/5d/5f6c63eebb5afee93bcaae4ce9a898f3373ca23df3ccaef086d0233a35a7/pydantic_core-2.41.5-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:f41a7489d32336dbf2199c8c0a215390a751c5b014c2c1c5366e817202e9cdf7", size = 2110990, upload-time = "2025-11-04T13:39:58.079Z" }, + { url = "https://files.pythonhosted.org/packages/aa/32/9c2e8ccb57c01111e0fd091f236c7b371c1bccea0fa85247ac55b1e2b6b6/pydantic_core-2.41.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:070259a8818988b9a84a449a2a7337c7f430a22acc0859c6b110aa7212a6d9c0", size = 1896003, upload-time = "2025-11-04T13:39:59.956Z" }, + { url = "https://files.pythonhosted.org/packages/68/b8/a01b53cb0e59139fbc9e4fda3e9724ede8de279097179be4ff31f1abb65a/pydantic_core-2.41.5-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e96cea19e34778f8d59fe40775a7a574d95816eb150850a85a7a4c8f4b94ac69", size = 1919200, upload-time = "2025-11-04T13:40:02.241Z" }, + { url = "https://files.pythonhosted.org/packages/38/de/8c36b5198a29bdaade07b5985e80a233a5ac27137846f3bc2d3b40a47360/pydantic_core-2.41.5-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ed2e99c456e3fadd05c991f8f437ef902e00eedf34320ba2b0842bd1c3ca3a75", size = 2052578, upload-time = "2025-11-04T13:40:04.401Z" }, + { url = "https://files.pythonhosted.org/packages/00/b5/0e8e4b5b081eac6cb3dbb7e60a65907549a1ce035a724368c330112adfdd/pydantic_core-2.41.5-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:65840751b72fbfd82c3c640cff9284545342a4f1eb1586ad0636955b261b0b05", size = 2208504, upload-time = "2025-11-04T13:40:06.072Z" }, + { url = 
"https://files.pythonhosted.org/packages/77/56/87a61aad59c7c5b9dc8caad5a41a5545cba3810c3e828708b3d7404f6cef/pydantic_core-2.41.5-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e536c98a7626a98feb2d3eaf75944ef6f3dbee447e1f841eae16f2f0a72d8ddc", size = 2335816, upload-time = "2025-11-04T13:40:07.835Z" }, + { url = "https://files.pythonhosted.org/packages/0d/76/941cc9f73529988688a665a5c0ecff1112b3d95ab48f81db5f7606f522d3/pydantic_core-2.41.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eceb81a8d74f9267ef4081e246ffd6d129da5d87e37a77c9bde550cb04870c1c", size = 2075366, upload-time = "2025-11-04T13:40:09.804Z" }, + { url = "https://files.pythonhosted.org/packages/d3/43/ebef01f69baa07a482844faaa0a591bad1ef129253ffd0cdaa9d8a7f72d3/pydantic_core-2.41.5-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d38548150c39b74aeeb0ce8ee1d8e82696f4a4e16ddc6de7b1d8823f7de4b9b5", size = 2171698, upload-time = "2025-11-04T13:40:12.004Z" }, + { url = "https://files.pythonhosted.org/packages/b1/87/41f3202e4193e3bacfc2c065fab7706ebe81af46a83d3e27605029c1f5a6/pydantic_core-2.41.5-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:c23e27686783f60290e36827f9c626e63154b82b116d7fe9adba1fda36da706c", size = 2132603, upload-time = "2025-11-04T13:40:13.868Z" }, + { url = "https://files.pythonhosted.org/packages/49/7d/4c00df99cb12070b6bccdef4a195255e6020a550d572768d92cc54dba91a/pydantic_core-2.41.5-cp312-cp312-musllinux_1_1_armv7l.whl", hash = "sha256:482c982f814460eabe1d3bb0adfdc583387bd4691ef00b90575ca0d2b6fe2294", size = 2329591, upload-time = "2025-11-04T13:40:15.672Z" }, + { url = "https://files.pythonhosted.org/packages/cc/6a/ebf4b1d65d458f3cda6a7335d141305dfa19bdc61140a884d165a8a1bbc7/pydantic_core-2.41.5-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:bfea2a5f0b4d8d43adf9d7b8bf019fb46fdd10a2e5cde477fbcb9d1fa08c68e1", size = 2319068, upload-time = "2025-11-04T13:40:17.532Z" }, + { url = 
"https://files.pythonhosted.org/packages/49/3b/774f2b5cd4192d5ab75870ce4381fd89cf218af999515baf07e7206753f0/pydantic_core-2.41.5-cp312-cp312-win32.whl", hash = "sha256:b74557b16e390ec12dca509bce9264c3bbd128f8a2c376eaa68003d7f327276d", size = 1985908, upload-time = "2025-11-04T13:40:19.309Z" }, + { url = "https://files.pythonhosted.org/packages/86/45/00173a033c801cacf67c190fef088789394feaf88a98a7035b0e40d53dc9/pydantic_core-2.41.5-cp312-cp312-win_amd64.whl", hash = "sha256:1962293292865bca8e54702b08a4f26da73adc83dd1fcf26fbc875b35d81c815", size = 2020145, upload-time = "2025-11-04T13:40:21.548Z" }, + { url = "https://files.pythonhosted.org/packages/f9/22/91fbc821fa6d261b376a3f73809f907cec5ca6025642c463d3488aad22fb/pydantic_core-2.41.5-cp312-cp312-win_arm64.whl", hash = "sha256:1746d4a3d9a794cacae06a5eaaccb4b8643a131d45fbc9af23e353dc0a5ba5c3", size = 1976179, upload-time = "2025-11-04T13:40:23.393Z" }, + { url = "https://files.pythonhosted.org/packages/87/06/8806241ff1f70d9939f9af039c6c35f2360cf16e93c2ca76f184e76b1564/pydantic_core-2.41.5-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:941103c9be18ac8daf7b7adca8228f8ed6bb7a1849020f643b3a14d15b1924d9", size = 2120403, upload-time = "2025-11-04T13:40:25.248Z" }, + { url = "https://files.pythonhosted.org/packages/94/02/abfa0e0bda67faa65fef1c84971c7e45928e108fe24333c81f3bfe35d5f5/pydantic_core-2.41.5-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:112e305c3314f40c93998e567879e887a3160bb8689ef3d2c04b6cc62c33ac34", size = 1896206, upload-time = "2025-11-04T13:40:27.099Z" }, + { url = "https://files.pythonhosted.org/packages/15/df/a4c740c0943e93e6500f9eb23f4ca7ec9bf71b19e608ae5b579678c8d02f/pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0cbaad15cb0c90aa221d43c00e77bb33c93e8d36e0bf74760cd00e732d10a6a0", size = 1919307, upload-time = "2025-11-04T13:40:29.806Z" }, + { url = 
"https://files.pythonhosted.org/packages/9a/e3/6324802931ae1d123528988e0e86587c2072ac2e5394b4bc2bc34b61ff6e/pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:03ca43e12fab6023fc79d28ca6b39b05f794ad08ec2feccc59a339b02f2b3d33", size = 2063258, upload-time = "2025-11-04T13:40:33.544Z" }, + { url = "https://files.pythonhosted.org/packages/c9/d4/2230d7151d4957dd79c3044ea26346c148c98fbf0ee6ebd41056f2d62ab5/pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:dc799088c08fa04e43144b164feb0c13f9a0bc40503f8df3e9fde58a3c0c101e", size = 2214917, upload-time = "2025-11-04T13:40:35.479Z" }, + { url = "https://files.pythonhosted.org/packages/e6/9f/eaac5df17a3672fef0081b6c1bb0b82b33ee89aa5cec0d7b05f52fd4a1fa/pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:97aeba56665b4c3235a0e52b2c2f5ae9cd071b8a8310ad27bddb3f7fb30e9aa2", size = 2332186, upload-time = "2025-11-04T13:40:37.436Z" }, + { url = "https://files.pythonhosted.org/packages/cf/4e/35a80cae583a37cf15604b44240e45c05e04e86f9cfd766623149297e971/pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:406bf18d345822d6c21366031003612b9c77b3e29ffdb0f612367352aab7d586", size = 2073164, upload-time = "2025-11-04T13:40:40.289Z" }, + { url = "https://files.pythonhosted.org/packages/bf/e3/f6e262673c6140dd3305d144d032f7bd5f7497d3871c1428521f19f9efa2/pydantic_core-2.41.5-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b93590ae81f7010dbe380cdeab6f515902ebcbefe0b9327cc4804d74e93ae69d", size = 2179146, upload-time = "2025-11-04T13:40:42.809Z" }, + { url = "https://files.pythonhosted.org/packages/75/c7/20bd7fc05f0c6ea2056a4565c6f36f8968c0924f19b7d97bbfea55780e73/pydantic_core-2.41.5-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:01a3d0ab748ee531f4ea6c3e48ad9dac84ddba4b0d82291f87248f2f9de8d740", size = 2137788, upload-time = 
"2025-11-04T13:40:44.752Z" }, + { url = "https://files.pythonhosted.org/packages/3a/8d/34318ef985c45196e004bc46c6eab2eda437e744c124ef0dbe1ff2c9d06b/pydantic_core-2.41.5-cp313-cp313-musllinux_1_1_armv7l.whl", hash = "sha256:6561e94ba9dacc9c61bce40e2d6bdc3bfaa0259d3ff36ace3b1e6901936d2e3e", size = 2340133, upload-time = "2025-11-04T13:40:46.66Z" }, + { url = "https://files.pythonhosted.org/packages/9c/59/013626bf8c78a5a5d9350d12e7697d3d4de951a75565496abd40ccd46bee/pydantic_core-2.41.5-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:915c3d10f81bec3a74fbd4faebe8391013ba61e5a1a8d48c4455b923bdda7858", size = 2324852, upload-time = "2025-11-04T13:40:48.575Z" }, + { url = "https://files.pythonhosted.org/packages/1a/d9/c248c103856f807ef70c18a4f986693a46a8ffe1602e5d361485da502d20/pydantic_core-2.41.5-cp313-cp313-win32.whl", hash = "sha256:650ae77860b45cfa6e2cdafc42618ceafab3a2d9a3811fcfbd3bbf8ac3c40d36", size = 1994679, upload-time = "2025-11-04T13:40:50.619Z" }, + { url = "https://files.pythonhosted.org/packages/9e/8b/341991b158ddab181cff136acd2552c9f35bd30380422a639c0671e99a91/pydantic_core-2.41.5-cp313-cp313-win_amd64.whl", hash = "sha256:79ec52ec461e99e13791ec6508c722742ad745571f234ea6255bed38c6480f11", size = 2019766, upload-time = "2025-11-04T13:40:52.631Z" }, + { url = "https://files.pythonhosted.org/packages/73/7d/f2f9db34af103bea3e09735bb40b021788a5e834c81eedb541991badf8f5/pydantic_core-2.41.5-cp313-cp313-win_arm64.whl", hash = "sha256:3f84d5c1b4ab906093bdc1ff10484838aca54ef08de4afa9de0f5f14d69639cd", size = 1981005, upload-time = "2025-11-04T13:40:54.734Z" }, + { url = "https://files.pythonhosted.org/packages/ea/28/46b7c5c9635ae96ea0fbb779e271a38129df2550f763937659ee6c5dbc65/pydantic_core-2.41.5-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:3f37a19d7ebcdd20b96485056ba9e8b304e27d9904d233d7b1015db320e51f0a", size = 2119622, upload-time = "2025-11-04T13:40:56.68Z" }, + { url = 
"https://files.pythonhosted.org/packages/74/1a/145646e5687e8d9a1e8d09acb278c8535ebe9e972e1f162ed338a622f193/pydantic_core-2.41.5-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:1d1d9764366c73f996edd17abb6d9d7649a7eb690006ab6adbda117717099b14", size = 1891725, upload-time = "2025-11-04T13:40:58.807Z" }, + { url = "https://files.pythonhosted.org/packages/23/04/e89c29e267b8060b40dca97bfc64a19b2a3cf99018167ea1677d96368273/pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:25e1c2af0fce638d5f1988b686f3b3ea8cd7de5f244ca147c777769e798a9cd1", size = 1915040, upload-time = "2025-11-04T13:41:00.853Z" }, + { url = "https://files.pythonhosted.org/packages/84/a3/15a82ac7bd97992a82257f777b3583d3e84bdb06ba6858f745daa2ec8a85/pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:506d766a8727beef16b7adaeb8ee6217c64fc813646b424d0804d67c16eddb66", size = 2063691, upload-time = "2025-11-04T13:41:03.504Z" }, + { url = "https://files.pythonhosted.org/packages/74/9b/0046701313c6ef08c0c1cf0e028c67c770a4e1275ca73131563c5f2a310a/pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4819fa52133c9aa3c387b3328f25c1facc356491e6135b459f1de698ff64d869", size = 2213897, upload-time = "2025-11-04T13:41:05.804Z" }, + { url = "https://files.pythonhosted.org/packages/8a/cd/6bac76ecd1b27e75a95ca3a9a559c643b3afcd2dd62086d4b7a32a18b169/pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2b761d210c9ea91feda40d25b4efe82a1707da2ef62901466a42492c028553a2", size = 2333302, upload-time = "2025-11-04T13:41:07.809Z" }, + { url = "https://files.pythonhosted.org/packages/4c/d2/ef2074dc020dd6e109611a8be4449b98cd25e1b9b8a303c2f0fca2f2bcf7/pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:22f0fb8c1c583a3b6f24df2470833b40207e907b90c928cc8d3594b76f874375", size = 2064877, upload-time = 
"2025-11-04T13:41:09.827Z" }, + { url = "https://files.pythonhosted.org/packages/18/66/e9db17a9a763d72f03de903883c057b2592c09509ccfe468187f2a2eef29/pydantic_core-2.41.5-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2782c870e99878c634505236d81e5443092fba820f0373997ff75f90f68cd553", size = 2180680, upload-time = "2025-11-04T13:41:12.379Z" }, + { url = "https://files.pythonhosted.org/packages/d3/9e/3ce66cebb929f3ced22be85d4c2399b8e85b622db77dad36b73c5387f8f8/pydantic_core-2.41.5-cp314-cp314-musllinux_1_1_aarch64.whl", hash = "sha256:0177272f88ab8312479336e1d777f6b124537d47f2123f89cb37e0accea97f90", size = 2138960, upload-time = "2025-11-04T13:41:14.627Z" }, + { url = "https://files.pythonhosted.org/packages/a6/62/205a998f4327d2079326b01abee48e502ea739d174f0a89295c481a2272e/pydantic_core-2.41.5-cp314-cp314-musllinux_1_1_armv7l.whl", hash = "sha256:63510af5e38f8955b8ee5687740d6ebf7c2a0886d15a6d65c32814613681bc07", size = 2339102, upload-time = "2025-11-04T13:41:16.868Z" }, + { url = "https://files.pythonhosted.org/packages/3c/0d/f05e79471e889d74d3d88f5bd20d0ed189ad94c2423d81ff8d0000aab4ff/pydantic_core-2.41.5-cp314-cp314-musllinux_1_1_x86_64.whl", hash = "sha256:e56ba91f47764cc14f1daacd723e3e82d1a89d783f0f5afe9c364b8bb491ccdb", size = 2326039, upload-time = "2025-11-04T13:41:18.934Z" }, + { url = "https://files.pythonhosted.org/packages/ec/e1/e08a6208bb100da7e0c4b288eed624a703f4d129bde2da475721a80cab32/pydantic_core-2.41.5-cp314-cp314-win32.whl", hash = "sha256:aec5cf2fd867b4ff45b9959f8b20ea3993fc93e63c7363fe6851424c8a7e7c23", size = 1995126, upload-time = "2025-11-04T13:41:21.418Z" }, + { url = "https://files.pythonhosted.org/packages/48/5d/56ba7b24e9557f99c9237e29f5c09913c81eeb2f3217e40e922353668092/pydantic_core-2.41.5-cp314-cp314-win_amd64.whl", hash = "sha256:8e7c86f27c585ef37c35e56a96363ab8de4e549a95512445b85c96d3e2f7c1bf", size = 2015489, upload-time = "2025-11-04T13:41:24.076Z" }, + { url = 
"https://files.pythonhosted.org/packages/4e/bb/f7a190991ec9e3e0ba22e4993d8755bbc4a32925c0b5b42775c03e8148f9/pydantic_core-2.41.5-cp314-cp314-win_arm64.whl", hash = "sha256:e672ba74fbc2dc8eea59fb6d4aed6845e6905fc2a8afe93175d94a83ba2a01a0", size = 1977288, upload-time = "2025-11-04T13:41:26.33Z" }, + { url = "https://files.pythonhosted.org/packages/92/ed/77542d0c51538e32e15afe7899d79efce4b81eee631d99850edc2f5e9349/pydantic_core-2.41.5-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:8566def80554c3faa0e65ac30ab0932b9e3a5cd7f8323764303d468e5c37595a", size = 2120255, upload-time = "2025-11-04T13:41:28.569Z" }, + { url = "https://files.pythonhosted.org/packages/bb/3d/6913dde84d5be21e284439676168b28d8bbba5600d838b9dca99de0fad71/pydantic_core-2.41.5-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:b80aa5095cd3109962a298ce14110ae16b8c1aece8b72f9dafe81cf597ad80b3", size = 1863760, upload-time = "2025-11-04T13:41:31.055Z" }, + { url = "https://files.pythonhosted.org/packages/5a/f0/e5e6b99d4191da102f2b0eb9687aaa7f5bea5d9964071a84effc3e40f997/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3006c3dd9ba34b0c094c544c6006cc79e87d8612999f1a5d43b769b89181f23c", size = 1878092, upload-time = "2025-11-04T13:41:33.21Z" }, + { url = "https://files.pythonhosted.org/packages/71/48/36fb760642d568925953bcc8116455513d6e34c4beaa37544118c36aba6d/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:72f6c8b11857a856bcfa48c86f5368439f74453563f951e473514579d44aa612", size = 2053385, upload-time = "2025-11-04T13:41:35.508Z" }, + { url = "https://files.pythonhosted.org/packages/20/25/92dc684dd8eb75a234bc1c764b4210cf2646479d54b47bf46061657292a8/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5cb1b2f9742240e4bb26b652a5aeb840aa4b417c7748b6f8387927bc6e45e40d", size = 2218832, upload-time = "2025-11-04T13:41:37.732Z" }, + { url = 
"https://files.pythonhosted.org/packages/e2/09/f53e0b05023d3e30357d82eb35835d0f6340ca344720a4599cd663dca599/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:bd3d54f38609ff308209bd43acea66061494157703364ae40c951f83ba99a1a9", size = 2327585, upload-time = "2025-11-04T13:41:40Z" }, + { url = "https://files.pythonhosted.org/packages/aa/4e/2ae1aa85d6af35a39b236b1b1641de73f5a6ac4d5a7509f77b814885760c/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2ff4321e56e879ee8d2a879501c8e469414d948f4aba74a2d4593184eb326660", size = 2041078, upload-time = "2025-11-04T13:41:42.323Z" }, + { url = "https://files.pythonhosted.org/packages/cd/13/2e215f17f0ef326fc72afe94776edb77525142c693767fc347ed6288728d/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d0d2568a8c11bf8225044aa94409e21da0cb09dcdafe9ecd10250b2baad531a9", size = 2173914, upload-time = "2025-11-04T13:41:45.221Z" }, + { url = "https://files.pythonhosted.org/packages/02/7a/f999a6dcbcd0e5660bc348a3991c8915ce6599f4f2c6ac22f01d7a10816c/pydantic_core-2.41.5-cp314-cp314t-musllinux_1_1_aarch64.whl", hash = "sha256:a39455728aabd58ceabb03c90e12f71fd30fa69615760a075b9fec596456ccc3", size = 2129560, upload-time = "2025-11-04T13:41:47.474Z" }, + { url = "https://files.pythonhosted.org/packages/3a/b1/6c990ac65e3b4c079a4fb9f5b05f5b013afa0f4ed6780a3dd236d2cbdc64/pydantic_core-2.41.5-cp314-cp314t-musllinux_1_1_armv7l.whl", hash = "sha256:239edca560d05757817c13dc17c50766136d21f7cd0fac50295499ae24f90fdf", size = 2329244, upload-time = "2025-11-04T13:41:49.992Z" }, + { url = "https://files.pythonhosted.org/packages/d9/02/3c562f3a51afd4d88fff8dffb1771b30cfdfd79befd9883ee094f5b6c0d8/pydantic_core-2.41.5-cp314-cp314t-musllinux_1_1_x86_64.whl", hash = "sha256:2a5e06546e19f24c6a96a129142a75cee553cc018ffee48a460059b1185f4470", size = 2331955, upload-time = "2025-11-04T13:41:54.079Z" }, + { url = 
"https://files.pythonhosted.org/packages/5c/96/5fb7d8c3c17bc8c62fdb031c47d77a1af698f1d7a406b0f79aaa1338f9ad/pydantic_core-2.41.5-cp314-cp314t-win32.whl", hash = "sha256:b4ececa40ac28afa90871c2cc2b9ffd2ff0bf749380fbdf57d165fd23da353aa", size = 1988906, upload-time = "2025-11-04T13:41:56.606Z" }, + { url = "https://files.pythonhosted.org/packages/22/ed/182129d83032702912c2e2d8bbe33c036f342cc735737064668585dac28f/pydantic_core-2.41.5-cp314-cp314t-win_amd64.whl", hash = "sha256:80aa89cad80b32a912a65332f64a4450ed00966111b6615ca6816153d3585a8c", size = 1981607, upload-time = "2025-11-04T13:41:58.889Z" }, + { url = "https://files.pythonhosted.org/packages/9f/ed/068e41660b832bb0b1aa5b58011dea2a3fe0ba7861ff38c4d4904c1c1a99/pydantic_core-2.41.5-cp314-cp314t-win_arm64.whl", hash = "sha256:35b44f37a3199f771c3eaa53051bc8a70cd7b54f333531c59e29fd4db5d15008", size = 1974769, upload-time = "2025-11-04T13:42:01.186Z" }, + { url = "https://files.pythonhosted.org/packages/11/72/90fda5ee3b97e51c494938a4a44c3a35a9c96c19bba12372fb9c634d6f57/pydantic_core-2.41.5-graalpy311-graalpy242_311_native-macosx_10_12_x86_64.whl", hash = "sha256:b96d5f26b05d03cc60f11a7761a5ded1741da411e7fe0909e27a5e6a0cb7b034", size = 2115441, upload-time = "2025-11-04T13:42:39.557Z" }, + { url = "https://files.pythonhosted.org/packages/1f/53/8942f884fa33f50794f119012dc6a1a02ac43a56407adaac20463df8e98f/pydantic_core-2.41.5-graalpy311-graalpy242_311_native-macosx_11_0_arm64.whl", hash = "sha256:634e8609e89ceecea15e2d61bc9ac3718caaaa71963717bf3c8f38bfde64242c", size = 1930291, upload-time = "2025-11-04T13:42:42.169Z" }, + { url = "https://files.pythonhosted.org/packages/79/c8/ecb9ed9cd942bce09fc888ee960b52654fbdbede4ba6c2d6e0d3b1d8b49c/pydantic_core-2.41.5-graalpy311-graalpy242_311_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:93e8740d7503eb008aa2df04d3b9735f845d43ae845e6dcd2be0b55a2da43cd2", size = 1948632, upload-time = "2025-11-04T13:42:44.564Z" }, + { url = 
"https://files.pythonhosted.org/packages/2e/1b/687711069de7efa6af934e74f601e2a4307365e8fdc404703afc453eab26/pydantic_core-2.41.5-graalpy311-graalpy242_311_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f15489ba13d61f670dcc96772e733aad1a6f9c429cc27574c6cdaed82d0146ad", size = 2138905, upload-time = "2025-11-04T13:42:47.156Z" }, + { url = "https://files.pythonhosted.org/packages/09/32/59b0c7e63e277fa7911c2fc70ccfb45ce4b98991e7ef37110663437005af/pydantic_core-2.41.5-graalpy312-graalpy250_312_native-macosx_10_12_x86_64.whl", hash = "sha256:7da7087d756b19037bc2c06edc6c170eeef3c3bafcb8f532ff17d64dc427adfd", size = 2110495, upload-time = "2025-11-04T13:42:49.689Z" }, + { url = "https://files.pythonhosted.org/packages/aa/81/05e400037eaf55ad400bcd318c05bb345b57e708887f07ddb2d20e3f0e98/pydantic_core-2.41.5-graalpy312-graalpy250_312_native-macosx_11_0_arm64.whl", hash = "sha256:aabf5777b5c8ca26f7824cb4a120a740c9588ed58df9b2d196ce92fba42ff8dc", size = 1915388, upload-time = "2025-11-04T13:42:52.215Z" }, + { url = "https://files.pythonhosted.org/packages/6e/0d/e3549b2399f71d56476b77dbf3cf8937cec5cd70536bdc0e374a421d0599/pydantic_core-2.41.5-graalpy312-graalpy250_312_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c007fe8a43d43b3969e8469004e9845944f1a80e6acd47c150856bb87f230c56", size = 1942879, upload-time = "2025-11-04T13:42:56.483Z" }, + { url = "https://files.pythonhosted.org/packages/f7/07/34573da085946b6a313d7c42f82f16e8920bfd730665de2d11c0c37a74b5/pydantic_core-2.41.5-graalpy312-graalpy250_312_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:76d0819de158cd855d1cbb8fcafdf6f5cf1eb8e470abe056d5d161106e38062b", size = 2139017, upload-time = "2025-11-04T13:42:59.471Z" }, + { url = "https://files.pythonhosted.org/packages/e6/b0/1a2aa41e3b5a4ba11420aba2d091b2d17959c8d1519ece3627c371951e73/pydantic_core-2.41.5-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = 
"sha256:b5819cd790dbf0c5eb9f82c73c16b39a65dd6dd4d1439dcdea7816ec9adddab8", size = 2103351, upload-time = "2025-11-04T13:43:02.058Z" }, + { url = "https://files.pythonhosted.org/packages/a4/ee/31b1f0020baaf6d091c87900ae05c6aeae101fa4e188e1613c80e4f1ea31/pydantic_core-2.41.5-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:5a4e67afbc95fa5c34cf27d9089bca7fcab4e51e57278d710320a70b956d1b9a", size = 1925363, upload-time = "2025-11-04T13:43:05.159Z" }, + { url = "https://files.pythonhosted.org/packages/e1/89/ab8e86208467e467a80deaca4e434adac37b10a9d134cd2f99b28a01e483/pydantic_core-2.41.5-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ece5c59f0ce7d001e017643d8d24da587ea1f74f6993467d85ae8a5ef9d4f42b", size = 2135615, upload-time = "2025-11-04T13:43:08.116Z" }, + { url = "https://files.pythonhosted.org/packages/99/0a/99a53d06dd0348b2008f2f30884b34719c323f16c3be4e6cc1203b74a91d/pydantic_core-2.41.5-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:16f80f7abe3351f8ea6858914ddc8c77e02578544a0ebc15b4c2e1a0e813b0b2", size = 2175369, upload-time = "2025-11-04T13:43:12.49Z" }, + { url = "https://files.pythonhosted.org/packages/6d/94/30ca3b73c6d485b9bb0bc66e611cff4a7138ff9736b7e66bcf0852151636/pydantic_core-2.41.5-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:33cb885e759a705b426baada1fe68cbb0a2e68e34c5d0d0289a364cf01709093", size = 2144218, upload-time = "2025-11-04T13:43:15.431Z" }, + { url = "https://files.pythonhosted.org/packages/87/57/31b4f8e12680b739a91f472b5671294236b82586889ef764b5fbc6669238/pydantic_core-2.41.5-pp310-pypy310_pp73-musllinux_1_1_armv7l.whl", hash = "sha256:c8d8b4eb992936023be7dee581270af5c6e0697a8559895f527f5b7105ecd36a", size = 2329951, upload-time = "2025-11-04T13:43:18.062Z" }, + { url = "https://files.pythonhosted.org/packages/7d/73/3c2c8edef77b8f7310e6fb012dbc4b8551386ed575b9eb6fb2506e28a7eb/pydantic_core-2.41.5-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = 
"sha256:242a206cd0318f95cd21bdacff3fcc3aab23e79bba5cac3db5a841c9ef9c6963", size = 2318428, upload-time = "2025-11-04T13:43:20.679Z" }, + { url = "https://files.pythonhosted.org/packages/2f/02/8559b1f26ee0d502c74f9cca5c0d2fd97e967e083e006bbbb4e97f3a043a/pydantic_core-2.41.5-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:d3a978c4f57a597908b7e697229d996d77a6d3c94901e9edee593adada95ce1a", size = 2147009, upload-time = "2025-11-04T13:43:23.286Z" }, + { url = "https://files.pythonhosted.org/packages/5f/9b/1b3f0e9f9305839d7e84912f9e8bfbd191ed1b1ef48083609f0dabde978c/pydantic_core-2.41.5-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:b2379fa7ed44ddecb5bfe4e48577d752db9fc10be00a6b7446e9663ba143de26", size = 2101980, upload-time = "2025-11-04T13:43:25.97Z" }, + { url = "https://files.pythonhosted.org/packages/a4/ed/d71fefcb4263df0da6a85b5d8a7508360f2f2e9b3bf5814be9c8bccdccc1/pydantic_core-2.41.5-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:266fb4cbf5e3cbd0b53669a6d1b039c45e3ce651fd5442eff4d07c2cc8d66808", size = 1923865, upload-time = "2025-11-04T13:43:28.763Z" }, + { url = "https://files.pythonhosted.org/packages/ce/3a/626b38db460d675f873e4444b4bb030453bbe7b4ba55df821d026a0493c4/pydantic_core-2.41.5-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:58133647260ea01e4d0500089a8c4f07bd7aa6ce109682b1426394988d8aaacc", size = 2134256, upload-time = "2025-11-04T13:43:31.71Z" }, + { url = "https://files.pythonhosted.org/packages/83/d9/8412d7f06f616bbc053d30cb4e5f76786af3221462ad5eee1f202021eb4e/pydantic_core-2.41.5-pp311-pypy311_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:287dad91cfb551c363dc62899a80e9e14da1f0e2b6ebde82c806612ca2a13ef1", size = 2174762, upload-time = "2025-11-04T13:43:34.744Z" }, + { url = "https://files.pythonhosted.org/packages/55/4c/162d906b8e3ba3a99354e20faa1b49a85206c47de97a639510a0e673f5da/pydantic_core-2.41.5-pp311-pypy311_pp73-musllinux_1_1_aarch64.whl", hash = 
"sha256:03b77d184b9eb40240ae9fd676ca364ce1085f203e1b1256f8ab9984dca80a84", size = 2143141, upload-time = "2025-11-04T13:43:37.701Z" }, + { url = "https://files.pythonhosted.org/packages/1f/f2/f11dd73284122713f5f89fc940f370d035fa8e1e078d446b3313955157fe/pydantic_core-2.41.5-pp311-pypy311_pp73-musllinux_1_1_armv7l.whl", hash = "sha256:a668ce24de96165bb239160b3d854943128f4334822900534f2fe947930e5770", size = 2330317, upload-time = "2025-11-04T13:43:40.406Z" }, + { url = "https://files.pythonhosted.org/packages/88/9d/b06ca6acfe4abb296110fb1273a4d848a0bfb2ff65f3ee92127b3244e16b/pydantic_core-2.41.5-pp311-pypy311_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:f14f8f046c14563f8eb3f45f499cc658ab8d10072961e07225e507adb700e93f", size = 2316992, upload-time = "2025-11-04T13:43:43.602Z" }, + { url = "https://files.pythonhosted.org/packages/36/c7/cfc8e811f061c841d7990b0201912c3556bfeb99cdcb7ed24adc8d6f8704/pydantic_core-2.41.5-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:56121965f7a4dc965bff783d70b907ddf3d57f6eba29b6d2e5dabfaf07799c51", size = 2145302, upload-time = "2025-11-04T13:43:46.64Z" }, +] + +[[package]] +name = "pydantic-evals" +version = "1.73.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "anyio" }, + { name = "logfire-api" }, + { name = "pydantic" }, + { name = "pydantic-ai-slim" }, + { name = "pyyaml" }, + { name = "rich" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/02/45/ce1f9b97c4838f940c98693bc1d6298f0e1396355998942b095ce17157fe/pydantic_evals-1.73.0.tar.gz", hash = "sha256:c1f38ad9c4f566bee6958c92f205b8200957b4baf3dd5239e2a4a06edd28e3dc", size = 56137, upload-time = "2026-03-27T03:49:50.861Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/01/4e/aefc34a68adc165ddec22c0632cb3076579c46751ac11acdf8cec6462891/pydantic_evals-1.73.0-py3-none-any.whl", hash = "sha256:0609210d4825cc8339b5cb649be38321450b46d6e87d72c1ffde73598741fd5a", size = 67143, upload-time = "2026-03-27T03:49:44.298Z" }, +] 
+ +[[package]] +name = "pydantic-graph" +version = "1.73.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "httpx" }, + { name = "logfire-api" }, + { name = "pydantic" }, + { name = "typing-inspection" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/1a/22/d479ea32e3c712c6711e41157fb975d81582e5171510e4c662f21a85e9fe/pydantic_graph-1.73.0.tar.gz", hash = "sha256:f0d3e4984af1d902cdda1ccd3fcd86949d45d3ed21559e781f7cf9eace2ed914", size = 58717, upload-time = "2026-03-27T03:49:51.967Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/08/b3/4cc0b1c543b8a0c1f9add7bdeb2e8cd583961a795664a1a74d1fc8200416/pydantic_graph-1.73.0-py3-none-any.whl", hash = "sha256:aaab8b1580885f5108401db0a7da58d6c7643e467eb626b8a1364b1030327de0", size = 72504, upload-time = "2026-03-27T03:49:45.668Z" }, +] + [[package]] name = "pygments" version = "2.19.2" @@ -1116,6 +1420,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/18/67/36e9267722cc04a6b9f15c7f3441c2363321a3ea07da7ae0c0707beb2a9c/typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548", size = 44614, upload-time = "2025-08-25T13:49:24.86Z" }, ] +[[package]] +name = "typing-inspection" +version = "0.4.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/55/e3/70399cb7dd41c10ac53367ae42139cf4b1ca5f36bb3dc6c9d33acdb43655/typing_inspection-0.4.2.tar.gz", hash = "sha256:ba561c48a67c5958007083d386c3295464928b01faa735ab8547c5692e87f464", size = 75949, upload-time = "2025-10-01T02:14:41.687Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/dc/9b/47798a6c91d8bdb567fe2698fe81e0c6b7cb7ef4d13da4114b41d239f65d/typing_inspection-0.4.2-py3-none-any.whl", hash = "sha256:4ed1cacbdc298c220f1bd249ed5287caa16f34d44ef4e9c3d0cbad5b521545e7", size = 14611, upload-time = 
"2025-10-01T02:14:40.154Z" }, +] + [[package]] name = "urllib3" version = "2.5.0" @@ -1230,3 +1546,12 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/68/a1/dcb68430b1d00b698ae7a7e0194433bce4f07ded185f0ee5fb21e2a2e91e/websockets-15.0.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:cad21560da69f4ce7658ca2cb83138fb4cf695a2ba3e475e0559e05991aa8122", size = 176884, upload-time = "2025-03-05T20:03:27.934Z" }, { url = "https://files.pythonhosted.org/packages/fa/a8/5b41e0da817d64113292ab1f8247140aac61cbf6cfd085d6a0fa77f4984f/websockets-15.0.1-py3-none-any.whl", hash = "sha256:f7a866fbc1e97b5c617ee4116daaa09b722101d4a3c170c787450ba409f9736f", size = 169743, upload-time = "2025-03-05T20:03:39.41Z" }, ] + +[[package]] +name = "zipp" +version = "3.23.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e3/02/0f2892c661036d50ede074e376733dca2ae7c6eb617489437771209d4180/zipp-3.23.0.tar.gz", hash = "sha256:a07157588a12518c9d4034df3fbbee09c814741a33ff63c05fa29d26a2404166", size = 25547, upload-time = "2025-06-08T17:06:39.4Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2e/54/647ade08bf0db230bfea292f893923872fd20be6ac6f53b2b936ba839d75/zipp-3.23.0-py3-none-any.whl", hash = "sha256:071652d6115ed432f5ce1d34c336c0adfd6a884660d1e9712a256d3d3bd4b14e", size = 10276, upload-time = "2025-06-08T17:06:38.034Z" }, +] From 7aa3b49a0f076ad768706aef4e896bb390f5d1d9 Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Mon, 30 Mar 2026 07:30:00 +0200 Subject: [PATCH 06/60] =?UTF-8?q?feat(evals):=20ShortCircuit=20=E2=80=94?= =?UTF-8?q?=20skip=20expensive=20evaluators=20on=20early=20fail?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit evaluators=[ not_empty, ShortCircuit([ contains_expected_facts(min_score=0.5), llm_judge(rubric="..."), # skipped if above fails ]), ] First Verdict=False stops the group. 
Evaluators outside run regardless. --- docs/evals.md | 18 ++++++++ protest/core/execution/test_executor.py | 13 +++--- protest/entities/events.py | 1 + protest/evals/__init__.py | 3 +- protest/evals/evaluator.py | 21 +++++++++ protest/evals/types.py | 11 +++-- protest/evals/wrapper.py | 32 ++++++++++++- protest/reporting/rich_reporter.py | 3 ++ tests/evals/test_e2e.py | 61 +++++++++++++++++++++++++ 9 files changed, 151 insertions(+), 12 deletions(-) diff --git a/docs/evals.md b/docs/evals.md index 1ff3235..b8cd74b 100644 --- a/docs/evals.md +++ b/docs/evals.md @@ -185,6 +185,24 @@ EvalCase(inputs="easy lookup", evaluators=[keyword_check(keywords=["paris"], min EvalCase(inputs="hard causal", evaluators=[keyword_check(keywords=["paris"], min_recall=0.3)]), ``` +### ShortCircuit + +Skip expensive evaluators (LLM judges) when cheap ones already fail: + +```python +from protest.evals import ShortCircuit + +evaluators=[ + not_empty, # always runs + ShortCircuit([ + contains_expected_facts(min_score=0.3), # 0ms — if fail → stop + llm_judge(rubric="factual accuracy"), # 3s — skipped if above fails + ]), +] +``` + +`ShortCircuit` is a group of ordered evaluators. The first `Verdict=False` stops the group. Evaluators outside the `ShortCircuit` always run. 
+ ### Using Evaluators ```python diff --git a/protest/core/execution/test_executor.py b/protest/core/execution/test_executor.py index 8b475c6..2e7a9c4 100644 --- a/protest/core/execution/test_executor.py +++ b/protest/core/execution/test_executor.py @@ -361,9 +361,10 @@ async def _acquire_fixture_semaphores( def _build_eval_error(payload: EvalPayload) -> AssertionError: """Build a descriptive AssertionError from failed eval scores.""" - failed = [ - f"{name}={entry.value}" - for name, entry in payload.scores.items() - if not entry.passed - ] - return AssertionError(f"{', '.join(failed)}") + parts = [] + for name, entry in payload.scores.items(): + if entry.skipped: + parts.append(f"{name}=⊘") + elif not entry.passed: + parts.append(f"{name}={entry.value}") + return AssertionError(f"{', '.join(parts)}") diff --git a/protest/entities/events.py b/protest/entities/events.py index d76434c..afb8971 100644 --- a/protest/entities/events.py +++ b/protest/entities/events.py @@ -14,6 +14,7 @@ class EvalScoreEntry: value: float | bool | str passed: bool = True + skipped: bool = False @dataclass(frozen=True, slots=True) diff --git a/protest/evals/__init__.py b/protest/evals/__init__.py index 17b35c9..54f5ef6 100644 --- a/protest/evals/__init__.py +++ b/protest/evals/__init__.py @@ -1,6 +1,6 @@ """ProTest evals — native eval support.""" -from protest.evals.evaluator import EvalCase, EvalContext, Metric, Reason, Verdict, evaluator +from protest.evals.evaluator import EvalCase, EvalContext, Metric, Reason, ShortCircuit, Verdict, evaluator from protest.evals.session import EvalSession from protest.evals.types import ( EvalCaseResult, @@ -23,6 +23,7 @@ "ModelInfo", "Reason", "ScoreStats", + "ShortCircuit", "Verdict", "evaluator", ] diff --git a/protest/evals/evaluator.py b/protest/evals/evaluator.py index 336df8d..cd8a615 100644 --- a/protest/evals/evaluator.py +++ b/protest/evals/evaluator.py @@ -80,6 +80,27 @@ def __repr__(self) -> str: return self.name or 
f"EvalCase({self.inputs!r})" +class ShortCircuit: + """Group evaluators with fail-fast behavior. + + The first Verdict=False stops the group. Evaluators outside + the ShortCircuit run regardless. + + Usage:: + + evaluators=[ + not_empty, + ShortCircuit([ + contains_expected_facts(min_score=0.5), + llm_judge(rubric="..."), # skipped if above fails + ]), + ] + """ + + def __init__(self, evaluators: list[Any]) -> None: + self.evaluators = evaluators + + class Metric: """Annotate a float/int field as a metric for stats aggregation.""" diff --git a/protest/evals/types.py b/protest/evals/types.py index 24082f1..ac61181 100644 --- a/protest/evals/types.py +++ b/protest/evals/types.py @@ -53,24 +53,27 @@ class EvalScore: name: str value: float | bool | str + skipped: bool = False @property def is_verdict(self) -> bool: - return isinstance(self.value, bool) + return not self.skipped and isinstance(self.value, bool) @property def is_metric(self) -> bool: - return isinstance(self.value, (int, float)) and not isinstance(self.value, bool) + return not self.skipped and isinstance(self.value, (int, float)) and not isinstance(self.value, bool) @property def is_reason(self) -> bool: - return isinstance(self.value, str) + return not self.skipped and isinstance(self.value, str) @property def passed(self) -> bool: + if self.skipped: + return True # skipped scores don't affect pass/fail if isinstance(self.value, bool): return self.value - return True # metrics and reasons always "pass" + return True @dataclass(frozen=True, slots=True) diff --git a/protest/evals/wrapper.py b/protest/evals/wrapper.py index c9087b6..0251f98 100644 --- a/protest/evals/wrapper.py +++ b/protest/evals/wrapper.py @@ -13,7 +13,7 @@ from typing import Any from protest.entities.events import EvalPayload, EvalScoreEntry -from protest.evals.evaluator import EvalContext, extract_scores_from_result +from protest.evals.evaluator import EvalContext, ShortCircuit, extract_scores_from_result from protest.evals.types 
import EvalScore @@ -65,6 +65,7 @@ async def eval_wrapper(**kwargs: Any) -> EvalPayload: s.name: EvalScoreEntry( value=s.value, passed=s.passed, + skipped=s.skipped, ) for s in scores }, @@ -163,6 +164,10 @@ async def run_evaluators( scores: list[EvalScore] = [] for ev in evaluators: + if isinstance(ev, ShortCircuit): + scores.extend(await _run_short_circuit(ev.evaluators, ctx)) + continue + evaluator_name = getattr(ev, "__name__", type(ev).__name__) try: raw = ev(ctx) @@ -174,3 +179,28 @@ async def run_evaluators( raise FixtureError(f"evaluator '{evaluator_name}'", exc) from exc return scores + + +async def _run_short_circuit( + evaluators: list[Any], ctx: EvalContext[Any, Any], +) -> list[EvalScore]: + """Run evaluators in order, stop at first Verdict=False.""" + scores: list[EvalScore] = [] + for i, ev in enumerate(evaluators): + evaluator_name = getattr(ev, "__name__", type(ev).__name__) + try: + raw = ev(ctx) + result = await raw if asyncio.iscoroutine(raw) else raw + except Exception as exc: + from protest.exceptions import FixtureError + + raise FixtureError(f"evaluator '{evaluator_name}'", exc) from exc + extracted = extract_scores_from_result(result, evaluator_name) + scores.extend(extracted) + if any(s.is_verdict and not s.passed for s in extracted): + # Mark remaining evaluators as skipped + for skipped_ev in evaluators[i + 1 :]: + skipped_name = getattr(skipped_ev, "__name__", type(skipped_ev).__name__) + scores.append(EvalScore(name=skipped_name, value=False, skipped=True)) + break + return scores diff --git a/protest/reporting/rich_reporter.py b/protest/reporting/rich_reporter.py index 8f263d9..414cb49 100644 --- a/protest/reporting/rich_reporter.py +++ b/protest/reporting/rich_reporter.py @@ -55,6 +55,9 @@ def _format_eval_scores_inline(result: TestResult) -> str: return "" parts = [] for name, entry in result.eval_payload.scores.items(): + if entry.skipped: + parts.append(f"{name}=⊘") + continue val = entry.value if isinstance(val, bool): 
parts.append(f"{name}={'✓' if val else '✗'}") diff --git a/tests/evals/test_e2e.py b/tests/evals/test_e2e.py index 5fbb4e8..6e35762 100644 --- a/tests/evals/test_e2e.py +++ b/tests/evals/test_e2e.py @@ -866,6 +866,67 @@ def eval_echo(case: Annotated[dict, From(single_case)]) -> str: assert results[0].is_fixture_error is True +class TestShortCircuit: + """ShortCircuit: skip expensive evaluators when cheap ones fail.""" + + def test_short_circuit_skips_on_fail(self) -> None: + from protest.evals import ShortCircuit + + call_log: list[str] = [] + + @evaluator + def cheap(ctx: EvalContext) -> bool: + call_log.append("cheap") + return "hello" in ctx.output.lower() + + @evaluator + def expensive(ctx: EvalContext) -> bool: + call_log.append("expensive") + return True + + session = EvalSession() + + @session.eval(evaluators=[ShortCircuit([cheap, expensive])]) + def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: + return echo_task(case["inputs"]) + + runner = TestRunner(session) + runner.run() + + # case_pass: cheap ✓ → expensive ✓ (both called) + # case_fail: cheap ✗ → expensive SKIPPED + assert call_log.count("cheap") == 2 + assert call_log.count("expensive") == 1 + + def test_short_circuit_all_pass(self) -> None: + from protest.evals import ShortCircuit + + call_log: list[str] = [] + + @evaluator + def check_a(ctx: EvalContext) -> bool: + call_log.append("a") + return True + + @evaluator + def check_b(ctx: EvalContext) -> bool: + call_log.append("b") + return True + + single = ForEach([{"inputs": "x", "expected": "x", "name": "c1"}], ids=lambda c: c["name"]) + session = EvalSession() + + @session.eval(evaluators=[ShortCircuit([check_a, check_b])]) + def eval_echo(case: Annotated[dict, From(single)]) -> str: + return echo_task(case["inputs"]) + + runner = TestRunner(session) + result = runner.run() + + assert result.success is True + assert call_log == ["a", "b"] + + # --------------------------------------------------------------------------- # Results files 
per run # --------------------------------------------------------------------------- From 3ed68a4aab8764a88048b434c92c6679ac153012 Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Mon, 30 Mar 2026 22:14:03 +0200 Subject: [PATCH 07/60] fix ci --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a402b70..75efa11 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -103,7 +103,7 @@ jobs: files: coverage.xml fail_ci_if_error: false -c docs: + docs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 From ad7a20714841cf1744b4a92bd2e3031e63748ea5 Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Mon, 30 Mar 2026 07:45:00 +0200 Subject: [PATCH 08/60] =?UTF-8?q?chore:=20fix=20all=20lint=20=E2=80=94=20m?= =?UTF-8?q?ove=20imports=20to=20top-level,=20no=20lazy=20imports?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- protest/__init__.py | 1 + protest/console.py | 14 ++-- protest/core/collector.py | 3 +- protest/core/execution/test_executor.py | 3 +- protest/core/runner.py | 14 ++-- protest/core/suite.py | 2 +- protest/di/container.py | 3 +- protest/di/hints.py | 13 +-- protest/di/validation.py | 3 +- protest/evals/__init__.py | 25 +++++- protest/evals/evaluator.py | 20 ++--- protest/evals/evaluators.py | 4 +- protest/evals/results_writer.py | 5 +- protest/evals/types.py | 6 +- protest/evals/wrapper.py | 13 ++- protest/reporting/ascii.py | 11 +-- protest/reporting/rich_reporter.py | 30 ++++--- tests/evals/test_e2e.py | 100 ++++++------------------ uv.lock | 2 +- 19 files changed, 115 insertions(+), 157 deletions(-) diff --git a/protest/__init__.py b/protest/__init__.py index 4509b37..97221b9 100644 --- a/protest/__init__.py +++ b/protest/__init__.py @@ -42,6 +42,7 @@ "__version__", "caplog", "collect_tests", + 
"console", "factory", "fixture", "list_tags", diff --git a/protest/console.py b/protest/console.py index 9270c16..29dd381 100644 --- a/protest/console.py +++ b/protest/console.py @@ -19,9 +19,13 @@ async def pipeline(): from __future__ import annotations +import contextlib import re import sys +from protest.events.types import Event +from protest.execution.capture import get_event_bus + def print(msg: str, *, raw: bool = False) -> None: """Print a message that bypasses test capture. @@ -33,22 +37,16 @@ def print(msg: str, *, raw: bool = False) -> None: msg: The message to print. Supports Rich markup unless raw=True. raw: If True, no markup processing — message passed as-is. """ - from protest.execution.capture import get_event_bus - bus = get_event_bus() if bus is None: _fallback_print(msg, raw) return - from protest.events.types import Event - # Call handlers directly (sync, bypasses async emit). # This ensures messages appear immediately, not after the test. - for handler_entry in bus._handlers.get(Event.USER_PRINT, []): - try: + for handler_entry in bus._handlers.get(Event.USER_PRINT, []): # type: ignore[union-attr] + with contextlib.suppress(Exception): handler_entry.func((msg, raw)) - except Exception: - pass def _fallback_print(msg: str, raw: bool) -> None: diff --git a/protest/core/collector.py b/protest/core/collector.py index 24356a8..d7c83db 100644 --- a/protest/core/collector.py +++ b/protest/core/collector.py @@ -5,6 +5,7 @@ from typing import TYPE_CHECKING, Annotated, Any, get_args, get_origin from protest.di.decorators import get_fixture_marker, unwrap_fixture +from protest.di.hints import get_type_hints_compat from protest.di.markers import Use from protest.di.validation import _extract_from_params from protest.entities import FixtureCallable, SuitePath, TestItem, TestRegistration @@ -18,8 +19,6 @@ def _extract_use_fixtures(func: Callable[..., Any]) -> list[FixtureCallable]: """Extract fixtures referenced via Use() markers in function parameters.""" 
- from protest.di.hints import get_type_hints_compat - type_hints = get_type_hints_compat(func) fixtures: list[FixtureCallable] = [] diff --git a/protest/core/execution/test_executor.py b/protest/core/execution/test_executor.py index 2e7a9c4..3c065f2 100644 --- a/protest/core/execution/test_executor.py +++ b/protest/core/execution/test_executor.py @@ -12,6 +12,7 @@ from protest.core.collector import get_transitive_fixtures from protest.core.outcome import OutcomeBuilder, TestExecutionResult from protest.di.container import FixtureContainer +from protest.di.hints import get_type_hints_compat from protest.entities import ( FixtureCallable, TestItem, @@ -255,8 +256,6 @@ async def _resolve_test_kwargs( func_signature = signature(item.func) kwargs: dict[str, Any] = dict(item.case_kwargs) - from protest.di.hints import get_type_hints_compat - type_hints = get_type_hints_compat(item.func) for param_name, param in func_signature.parameters.items(): diff --git a/protest/core/runner.py b/protest/core/runner.py index 70669d0..4e58544 100644 --- a/protest/core/runner.py +++ b/protest/core/runner.py @@ -9,7 +9,7 @@ from protest.core.collector import Collector from protest.core.execution import ParallelExecutor, SuiteManager, TestExecutor from protest.core.outcome import OutcomeBuilder -from protest.core.session import ProTestSession +from protest.core.session import ProTestSession # noqa: TC001 — used at runtime from protest.core.tracker import SuiteTracker from protest.entities import ( RunResult, @@ -17,9 +17,12 @@ SessionSetupInfo, TestCounts, ) +from protest.evals.types import EvalCaseResult, EvalScore, EvalSuiteReport from protest.events.types import Event from protest.execution.capture import ( GlobalCapturePatch, + reset_event_bus, + set_event_bus, set_session_setup_capture, ) from protest.execution.context import cancellation_event @@ -27,7 +30,6 @@ if TYPE_CHECKING: from protest.entities.events import TestResult - from protest.evals.types import EvalCaseResult class 
TestRunner: @@ -77,7 +79,7 @@ def _collect_eval_result(self, result: TestResult) -> None: case_result = _build_eval_case_result(result) self._eval_results.setdefault(suite_name, []).append(case_result) - async def _main_loop(self) -> bool: + async def _main_loop(self) -> bool: # noqa: PLR0915 """The main async loop for running tests.""" session_start = time.perf_counter() @@ -100,8 +102,6 @@ async def _main_loop(self) -> bool: total_counts = TestCounts() # Inject cancellation event into context for teardown awareness - from protest.execution.capture import reset_event_bus, set_event_bus - cancel_token = cancellation_event.set( self._interrupt_handler.force_teardown_event ) @@ -190,8 +190,6 @@ async def _main_loop(self) -> bool: async def _emit_eval_suite_end(self, suite_path: Any) -> None: """Emit EVAL_SUITE_END if this suite_path corresponds to an eval suite.""" - from protest.evals.types import EvalSuiteReport - suite_name = ( suite_path.root_name if hasattr(suite_path, "root_name") @@ -210,8 +208,6 @@ async def _emit_eval_suite_end(self, suite_path: Any) -> None: def _build_eval_case_result(result: TestResult) -> EvalCaseResult: """Build EvalCaseResult from a TestResult with eval_payload.""" - from protest.evals.types import EvalCaseResult, EvalScore - payload = result.eval_payload assert payload is not None return EvalCaseResult( diff --git a/protest/core/suite.py b/protest/core/suite.py index dfb64c3..1a8da5d 100644 --- a/protest/core/suite.py +++ b/protest/core/suite.py @@ -21,6 +21,7 @@ normalize_skip, normalize_xfail, ) +from protest.evals.wrapper import make_eval_wrapper from protest.exceptions import ConcurrencyMismatchError, InvalidMaxConcurrencyError FuncT = TypeVar("FuncT", bound="Callable[..., object]") @@ -167,7 +168,6 @@ def eval( timeout: float | None = None, ) -> Callable[[FuncT], FuncT]: """Register a scored eval test on this suite.""" - from protest.evals.wrapper import make_eval_wrapper def decorator(func: FuncT) -> FuncT: wrapper = 
make_eval_wrapper( diff --git a/protest/di/container.py b/protest/di/container.py index 5c38571..3a85ae0 100644 --- a/protest/di/container.py +++ b/protest/di/container.py @@ -22,6 +22,7 @@ unwrap_fixture, ) from protest.di.factory import FixtureFactory +from protest.di.hints import get_type_hints_compat from protest.di.markers import Use from protest.di.proxy import FixtureErrorWrapper from protest.entities import ( @@ -780,8 +781,6 @@ def _analyze_and_store_dependencies( actual_func = unwrap_fixture(func) func_signature = signature(actual_func) - from protest.di.hints import get_type_hints_compat - type_hints = get_type_hints_compat(actual_func) dependencies: dict[str, FixtureCallable] = {} diff --git a/protest/di/hints.py b/protest/di/hints.py index ede4c12..bd6a89b 100644 --- a/protest/di/hints.py +++ b/protest/di/hints.py @@ -14,6 +14,7 @@ from __future__ import annotations +import contextlib import inspect import re from typing import Any, get_type_hints @@ -21,23 +22,17 @@ def get_type_hints_compat(func: Any) -> dict[str, Any]: """Resolve type hints with PEP 563 / TYPE_CHECKING fallbacks.""" - try: + with contextlib.suppress(Exception): return get_type_hints(func, include_extras=True) - except Exception: - pass # Build a namespace from the entire call stack (covers local fixtures). localns: dict[str, Any] = {} - try: + with contextlib.suppress(Exception): for frame_info in inspect.stack(): localns.update(frame_info.frame.f_locals) - except Exception: - pass - try: + with contextlib.suppress(Exception): return get_type_hints(func, localns=localns, include_extras=True) - except Exception: - pass # TYPE_CHECKING fallback: substitute Any for unresolvable names. 
return _get_type_hints_substituting_any(func, localns) diff --git a/protest/di/validation.py b/protest/di/validation.py index d716397..1026bca 100644 --- a/protest/di/validation.py +++ b/protest/di/validation.py @@ -5,6 +5,7 @@ from inspect import signature from typing import TYPE_CHECKING, Annotated, Any, get_args, get_origin +from protest.di.hints import get_type_hints_compat from protest.di.markers import ForEach, From from protest.exceptions import ParameterizedFixtureError from protest.utils import get_callable_name @@ -15,8 +16,6 @@ def _extract_from_params(func: Callable[..., Any]) -> dict[str, ForEach[Any]]: """Extract parameters annotated with From(source).""" - from protest.di.hints import get_type_hints_compat - type_hints = get_type_hints_compat(func) result: dict[str, ForEach[Any]] = {} diff --git a/protest/evals/__init__.py b/protest/evals/__init__.py index 54f5ef6..fdb5115 100644 --- a/protest/evals/__init__.py +++ b/protest/evals/__init__.py @@ -1,7 +1,14 @@ """ProTest evals — native eval support.""" -from protest.evals.evaluator import EvalCase, EvalContext, Metric, Reason, ShortCircuit, Verdict, evaluator -from protest.evals.session import EvalSession +from protest.evals.evaluator import ( + EvalCase, + EvalContext, + Metric, + Reason, + ShortCircuit, + Verdict, + evaluator, +) from protest.evals.types import ( EvalCaseResult, EvalScore, @@ -15,11 +22,11 @@ "EvalCase", "EvalCaseResult", "EvalContext", - "Metric", "EvalScore", "EvalSession", "EvalSuiteReport", "JudgeInfo", + "Metric", "ModelInfo", "Reason", "ScoreStats", @@ -27,3 +34,15 @@ "Verdict", "evaluator", ] + + +def __getattr__(name: str) -> object: + # EvalSession imports protest.core.session which imports reporters, + # and reporters import protest.evals.types — eagerly importing + # EvalSession here would create a circular import chain. 
+ if name == "EvalSession": + from protest.evals.session import EvalSession + + return EvalSession + msg = f"module {__name__!r} has no attribute {name!r}" + raise AttributeError(msg) diff --git a/protest/evals/evaluator.py b/protest/evals/evaluator.py index cd8a615..61a8a72 100644 --- a/protest/evals/evaluator.py +++ b/protest/evals/evaluator.py @@ -38,18 +38,18 @@ async def llm_judge(ctx: EvalContext, model: str = "haiku") -> bool: from dataclasses import dataclass, field from typing import Any, Generic, TypeVar -I = TypeVar("I") -O = TypeVar("O") +InputT = TypeVar("InputT") +OutputT = TypeVar("OutputT") @dataclass -class EvalContext(Generic[I, O]): +class EvalContext(Generic[InputT, OutputT]): """Context passed to evaluator functions.""" name: str - inputs: I - output: O - expected_output: O | None + inputs: InputT + output: OutputT + expected_output: OutputT | None metadata: Any duration: float @@ -138,15 +138,15 @@ def extract_scores_from_result(result: Any, evaluator_name: str) -> list[Any]: if ann is None or get_origin(ann) is not Annotated: continue for meta in get_args(ann)[1:]: - if isinstance(meta, type) and issubclass(meta, (Metric, Verdict, Reason)): + if isinstance(meta, type) and issubclass( + meta, (Metric, Verdict, Reason) + ): scores.append(EvalScore(name=f.name, value=getattr(result, f.name))) break return scores type_name = type(result).__name__ - raise TypeError( - f"Evaluator must return bool or dataclass, got {type_name}" - ) + raise TypeError(f"Evaluator must return bool or dataclass, got {type_name}") def evaluator(fn: Any) -> Any: diff --git a/protest/evals/evaluators.py b/protest/evals/evaluators.py index b9b1475..d2cd632 100644 --- a/protest/evals/evaluators.py +++ b/protest/evals/evaluators.py @@ -44,7 +44,9 @@ class WordOverlapResult: @evaluator -def contains_keywords(ctx: EvalContext, keywords: list[str], min_recall: float = 0.0) -> ContainsKeywordsResult: +def contains_keywords( + ctx: EvalContext, keywords: list[str], min_recall: 
float = 0.0 +) -> ContainsKeywordsResult: """Check that the output contains expected keywords (case-insensitive).""" output_lower = ctx.output.lower() found = sum(1 for kw in keywords if kw.lower() in output_lower) diff --git a/protest/evals/results_writer.py b/protest/evals/results_writer.py index 0054e25..0c670a8 100644 --- a/protest/evals/results_writer.py +++ b/protest/evals/results_writer.py @@ -140,10 +140,7 @@ def _render_case(case: EvalCaseResult) -> str: def _format_score(score: EvalScore) -> str: - if score.is_metric: - icon = "·" - else: - icon = "✓" if score.passed else "✗" + icon = "·" if score.is_metric else ("✓" if score.passed else "✗") return f"- **{score.name}**: {score.value} {icon}" diff --git a/protest/evals/types.py b/protest/evals/types.py index ac61181..121264f 100644 --- a/protest/evals/types.py +++ b/protest/evals/types.py @@ -61,7 +61,11 @@ def is_verdict(self) -> bool: @property def is_metric(self) -> bool: - return not self.skipped and isinstance(self.value, (int, float)) and not isinstance(self.value, bool) + return ( + not self.skipped + and isinstance(self.value, (int, float)) + and not isinstance(self.value, bool) + ) @property def is_reason(self) -> bool: diff --git a/protest/evals/wrapper.py b/protest/evals/wrapper.py index 0251f98..537282b 100644 --- a/protest/evals/wrapper.py +++ b/protest/evals/wrapper.py @@ -13,7 +13,11 @@ from typing import Any from protest.entities.events import EvalPayload, EvalScoreEntry -from protest.evals.evaluator import EvalContext, ShortCircuit, extract_scores_from_result +from protest.evals.evaluator import ( + EvalContext, + ShortCircuit, + extract_scores_from_result, +) from protest.evals.types import EvalScore @@ -182,7 +186,8 @@ async def run_evaluators( async def _run_short_circuit( - evaluators: list[Any], ctx: EvalContext[Any, Any], + evaluators: list[Any], + ctx: EvalContext[Any, Any], ) -> list[EvalScore]: """Run evaluators in order, stop at first Verdict=False.""" scores: list[EvalScore] = 
[] @@ -200,7 +205,9 @@ async def _run_short_circuit( if any(s.is_verdict and not s.passed for s in extracted): # Mark remaining evaluators as skipped for skipped_ev in evaluators[i + 1 :]: - skipped_name = getattr(skipped_ev, "__name__", type(skipped_ev).__name__) + skipped_name = getattr( + skipped_ev, "__name__", type(skipped_ev).__name__ + ) scores.append(EvalScore(name=skipped_name, value=False, skipped=True)) break return scores diff --git a/protest/reporting/ascii.py b/protest/reporting/ascii.py index a52c509..ea4040d 100644 --- a/protest/reporting/ascii.py +++ b/protest/reporting/ascii.py @@ -1,9 +1,11 @@ +import sys import traceback from pathlib import Path from typing import Any from typing_extensions import Self +from protest.console import strip_markup from protest.entities import ( FixtureInfo, HandlerInfo, @@ -19,6 +21,7 @@ TestStartInfo, TestTeardownInfo, ) +from protest.evals.types import EvalSuiteReport from protest.plugin import PluginBase, PluginContext from protest.reporting.verbosity import Verbosity @@ -149,8 +152,6 @@ def on_test_teardown_start(self, info: TestTeardownInfo) -> None: @staticmethod def _print_bypass(msg: str) -> None: - import sys - stream = getattr(sys.stdout, "_original", sys.stdout) stream.write(msg + "\n") stream.flush() @@ -260,10 +261,6 @@ def _print_failure_detail(self, result: TestResult, *, is_error: bool) -> None: print(f" {line}") def on_user_print(self, data: Any) -> None: - import sys - - from protest.console import strip_markup - msg, raw = data text = msg if raw else strip_markup(msg) stream = getattr(sys.stdout, "_original", sys.stdout) @@ -271,8 +268,6 @@ def on_user_print(self, data: Any) -> None: stream.flush() def on_eval_suite_end(self, report: Any) -> None: - from protest.evals.types import EvalSuiteReport - if not isinstance(report, EvalSuiteReport): return stats = report.all_score_stats() diff --git a/protest/reporting/rich_reporter.py b/protest/reporting/rich_reporter.py index 414cb49..5e1e96b 100644 --- 
a/protest/reporting/rich_reporter.py +++ b/protest/reporting/rich_reporter.py @@ -1,9 +1,12 @@ +import logging +import sys import traceback from argparse import ArgumentParser from pathlib import Path from typing import Any from rich.console import Console # type: ignore[import-not-found] +from rich.table import Table # type: ignore[import-not-found] from typing_extensions import Self from protest.entities import ( @@ -21,6 +24,7 @@ TestStartInfo, TestTeardownInfo, ) +from protest.evals.types import EvalSuiteReport from protest.plugin import PluginBase, PluginContext from protest.reporting.verbosity import Verbosity @@ -151,8 +155,6 @@ def _maybe_show_logs(self, result: TestResult) -> None: """Show captured log records if --show-logs is active.""" if not self._show_logs or not result.log_records: return - import logging - min_level = getattr(logging, self._show_logs.upper(), logging.INFO) for record in result.log_records: if record.levelno >= min_level: @@ -170,10 +172,6 @@ def _maybe_show_logs(self, result: TestResult) -> None: def _print_bypass(self, message: str) -> None: """Print bypassing capture (for lifecycle messages emitted during tests).""" - import sys - - from rich.console import Console - stream = getattr(sys.stdout, "_original", sys.stdout) Console(file=stream, highlight=False).print(message) @@ -397,10 +395,6 @@ def _print_failure_detail(self, result: TestResult, *, is_error: bool) -> None: self._print(f"[dim]{escaped_line}[/]") def on_user_print(self, data: Any) -> None: - import sys - - from rich.console import Console - msg, raw = data # Write to the real stdout, bypassing capture stream = getattr(sys.stdout, "_original", sys.stdout) @@ -411,10 +405,6 @@ def on_user_print(self, data: Any) -> None: c.print(f"[dim] │[/] {msg}") def on_eval_suite_end(self, report: Any) -> None: - from rich.table import Table - - from protest.evals.types import EvalSuiteReport - if not isinstance(report, EvalSuiteReport): return stats = report.all_score_stats() @@ 
-444,8 +434,16 @@ def on_eval_suite_end(self, report: Any) -> None: self._print( f" [cyan]Eval: {report.suite_name} ({report.total_count} cases)[/]" ) - rate_pct = report.pass_rate * 100 - color = "green" if rate_pct >= 100 else "yellow" if rate_pct >= 50 else "red" + full_pass = 100 + half_pass = 50 + rate_pct = report.pass_rate * full_pass + color = ( + "green" + if rate_pct >= full_pass + else "yellow" + if rate_pct >= half_pass + else "red" + ) self._print( f" [{color}]Passed: {report.passed_count}/{report.total_count} ({rate_pct:.1f}%)[/]" ) diff --git a/tests/evals/test_e2e.py b/tests/evals/test_e2e.py index 6e35762..9bdaead 100644 --- a/tests/evals/test_e2e.py +++ b/tests/evals/test_e2e.py @@ -13,13 +13,25 @@ from __future__ import annotations import json +import subprocess from dataclasses import dataclass -from pathlib import Path +from pathlib import Path # noqa: TC003 — used at runtime (pytest tmp_path) from typing import Annotated, Any -from protest import ForEach, From, ProTestSession +from protest import ForEach, From, ProTestSession, Use, fixture +from protest.api import run_session +from protest.core.collector import Collector from protest.core.runner import TestRunner -from protest.evals import EvalContext, EvalSession, Metric, ModelInfo, Verdict, evaluator +from protest.core.suite import ProTestSuite +from protest.evals import ( + EvalContext, + EvalSession, + Metric, + ModelInfo, + ShortCircuit, + Verdict, + evaluator, +) from protest.evals.evaluators import ( contains_expected, contains_keywords, @@ -31,6 +43,12 @@ not_empty, word_overlap, ) +from protest.evals.hashing import compute_case_hash, compute_eval_hash +from protest.evals.results_writer import EvalResultsWriter +from protest.evals.types import EvalSuiteReport # noqa: TC001 — used at runtime +from protest.filters.kind import KindFilterPlugin +from protest.history.storage import append_entry, clean_dirty +from protest.plugin import PluginBase, PluginContext # 
--------------------------------------------------------------------------- # Fixtures: deterministic evaluators + task @@ -159,8 +177,6 @@ class TestKindFiltering: """Suites have kind, filtering works.""" def test_test_suite_has_kind_test(self) -> None: - from protest.core.suite import ProTestSuite - suite = ProTestSuite("my_tests") assert suite.kind == "test" @@ -174,9 +190,6 @@ def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: assert any(s.kind == "eval" for s in session._suites) def test_kind_filter_keeps_only_matching(self) -> None: - from protest.core.suite import ProTestSuite - from protest.filters.kind import KindFilterPlugin - test_suite = ProTestSuite("tests") eval_suite = ProTestSuite("evals", kind="eval") @@ -193,8 +206,6 @@ def eval_one() -> None: session.add_suite(test_suite) session.add_suite(eval_suite) - from protest.core.collector import Collector - items = Collector().collect(session) assert len(items) == 2 @@ -206,8 +217,6 @@ def eval_one() -> None: def test_unified_session_runs_tests_only(self) -> None: """protest run behavior: only kind=test suites.""" - from protest.core.suite import ProTestSuite - session = ProTestSession() test_suite = ProTestSuite("unit") @@ -223,9 +232,6 @@ def test_a() -> None: def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: return echo_task(case["inputs"]) - from protest.api import run_session - from protest.plugin import PluginContext - ctx = PluginContext(args={"kind_filter": "test"}) run_session(session, ctx=ctx) @@ -233,8 +239,6 @@ def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: def test_unified_session_runs_evals_only(self) -> None: """protest eval behavior: only kind=eval suites.""" - from protest.core.suite import ProTestSuite - session = ProTestSession() test_suite = ProTestSuite("unit") @@ -250,9 +254,6 @@ def test_a() -> None: def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: return echo_task(case["inputs"]) - from protest.api import run_session - 
from protest.plugin import PluginContext - ctx = PluginContext(args={"kind_filter": "eval"}) run_session(session, ctx=ctx) @@ -272,9 +273,6 @@ class TestEvalOutput: """ def test_report_contains_score_stats(self) -> None: - from protest.evals.types import EvalSuiteReport - from protest.plugin import PluginBase - reports: list[EvalSuiteReport] = [] class ReportCapture(PluginBase): @@ -300,9 +298,6 @@ def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: assert any(s.name == "accuracy" for s in stats) def test_report_has_pass_count(self) -> None: - from protest.evals.types import EvalSuiteReport - from protest.plugin import PluginBase - reports: list[EvalSuiteReport] = [] class ReportCapture(PluginBase): @@ -327,8 +322,6 @@ def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: def test_failed_eval_has_error_with_score_details(self) -> None: """When an eval case fails, the error message includes score details.""" - from protest.plugin import PluginBase - errors: list[Any] = [] class ErrorCollector(PluginBase): @@ -345,8 +338,6 @@ def on_test_fail(self, result: Any) -> None: def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: return echo_task(case["inputs"]) - from protest.api import run_session - run_session(session) # case_fail has matches_expected=False @@ -362,8 +353,6 @@ class TestEvalPayloadFlow: """EvalPayload flows through the framework correctly.""" def test_test_result_has_eval_payload(self) -> None: - from protest.plugin import PluginBase - collected: list[Any] = [] class Collector(PluginBase): @@ -395,8 +384,6 @@ def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: def test_lifecycle_events_have_case_id_in_node_id(self) -> None: """setup_done/teardown_start events carry node_id with [case_id].""" - from protest.plugin import PluginBase - setup_ids: list[str] = [] teardown_ids: list[str] = [] @@ -427,8 +414,6 @@ def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: def 
test_evaluator_exception_is_error_not_fail(self) -> None: """An evaluator that raises is treated as error (infra), not test fail.""" - from protest.plugin import PluginBase - results: list[Any] = [] class Collector(PluginBase): @@ -463,8 +448,6 @@ def eval_echo(case: Annotated[dict, From(single_case)]) -> str: assert "LLM judge timeout" in str(results[0].error) def test_non_eval_test_has_no_payload(self) -> None: - from protest.plugin import PluginBase - collected: list[Any] = [] class Collector(PluginBase): @@ -497,8 +480,6 @@ class TestHistory: """JSONL history format and querying.""" def _run_eval(self, tmp_path: Path) -> None: - from protest.api import run_session - session = EvalSession(model=ModelInfo(name="test-model"), history_dir=tmp_path) @session.eval(evaluators=[fake_accuracy]) @@ -540,8 +521,6 @@ def test_history_entry_format(self, tmp_path: Path) -> None: assert "cases" in suite def test_history_test_run_has_null_evals(self, tmp_path: Path) -> None: - from protest.api import run_session - session = ProTestSession(history=True, history_dir=tmp_path) @session.test() @@ -561,8 +540,6 @@ def test_history_multiple_runs_append(self, tmp_path: Path) -> None: assert len(lines) == 2 def test_history_metadata_included(self, tmp_path: Path) -> None: - from protest.api import run_session - session = EvalSession( history_dir=tmp_path, metadata={"env": "test", "version": "1.0"}, @@ -589,13 +566,9 @@ class TestCleanDirty: def test_clean_dirty_removes_current_head_only(self, tmp_path: Path) -> None: # Entry with current HEAD + dirty - import subprocess - - from protest.history.storage import append_entry, clean_dirty - try: current_commit = subprocess.run( - ["git", "rev-parse", "HEAD"], + ["git", "rev-parse", "HEAD"], # noqa: S607 capture_output=True, text=True, timeout=5, @@ -634,8 +607,6 @@ class TestCaseHashing: def test_case_hash_stored_in_history(self, tmp_path: Path) -> None: """History entries include case_hash and eval_hash per case.""" - from protest.api 
import run_session - session = EvalSession(history_dir=tmp_path) @session.eval(evaluators=[fake_accuracy]) @@ -656,24 +627,18 @@ def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: def test_case_hash_changes_on_input_change(self) -> None: """Different inputs -> different case_hash.""" - from protest.evals.hashing import compute_case_hash - h1 = compute_case_hash("hello world", "expected") h2 = compute_case_hash("hello world modified", "expected") assert h1 != h2 def test_case_hash_stable_for_same_input(self) -> None: """Same inputs -> same case_hash (deterministic).""" - from protest.evals.hashing import compute_case_hash - h1 = compute_case_hash("hello world", "expected") h2 = compute_case_hash("hello world", "expected") assert h1 == h2 def test_eval_hash_changes_on_evaluator_change(self) -> None: """Different evaluators -> different eval_hash.""" - from protest.evals.hashing import compute_eval_hash - e1 = contains_keywords(keywords=["hello"]) e2 = contains_keywords(keywords=["hello", "world"]) h1 = compute_eval_hash([e1]) @@ -762,8 +727,6 @@ class TestScoringV2: def test_bool_evaluator_pass(self) -> None: """Evaluator returning True -> case passes.""" - from protest.plugin import PluginBase - results: list[Any] = [] class Collector(PluginBase): @@ -798,8 +761,6 @@ def eval_echo(case: Annotated[dict, From(single_case)]) -> str: def test_dataclass_without_bool_is_tracking_only(self) -> None: """Dataclass with only float fields -> tracking-only, always passes.""" - from protest.plugin import PluginBase - results: list[Any] = [] class Collector(PluginBase): @@ -833,8 +794,6 @@ def eval_echo(case: Annotated[dict, From(single_case)]) -> str: def test_float_return_raises_type_error(self) -> None: """Evaluator returning naked float -> TypeError (caught as fixture error).""" - from protest.plugin import PluginBase - results: list[Any] = [] class Collector(PluginBase): @@ -870,8 +829,6 @@ class TestShortCircuit: """ShortCircuit: skip expensive evaluators when 
cheap ones fail.""" def test_short_circuit_skips_on_fail(self) -> None: - from protest.evals import ShortCircuit - call_log: list[str] = [] @evaluator @@ -899,8 +856,6 @@ def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: assert call_log.count("expensive") == 1 def test_short_circuit_all_pass(self) -> None: - from protest.evals import ShortCircuit - call_log: list[str] = [] @evaluator @@ -913,7 +868,9 @@ def check_b(ctx: EvalContext) -> bool: call_log.append("b") return True - single = ForEach([{"inputs": "x", "expected": "x", "name": "c1"}], ids=lambda c: c["name"]) + single = ForEach( + [{"inputs": "x", "expected": "x", "name": "c1"}], ids=lambda c: c["name"] + ) session = EvalSession() @session.eval(evaluators=[ShortCircuit([check_a, check_b])]) @@ -936,8 +893,6 @@ class TestResultsFiles: """Per-case markdown files written to .protest/results/_/.""" def _run_eval(self, tmp_path: Path) -> Path: - from protest.evals.results_writer import EvalResultsWriter - results_dir = tmp_path / "results" session = EvalSession() writer = EvalResultsWriter(history_dir=tmp_path) @@ -994,8 +949,6 @@ class TestMultiDatasetHistory: """Multiple @session.eval calls produce distinct suites in history.""" def _run_multi(self, tmp_path: Path) -> dict[str, Any]: - from protest.api import run_session - pipeline_cases = ForEach( [ {"inputs": "hello", "expected": "hello", "name": "c1"}, @@ -1059,7 +1012,6 @@ def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: def test_task_with_session_fixture_is_injected(self) -> None: """Une fixture session-scoped est injectee dans task via Use().""" - from protest import Use, fixture @fixture() def prefix_service() -> str: @@ -1090,8 +1042,6 @@ async def eval_prefixed( def test_session_fixture_resolved_once_for_all_cases(self) -> None: """Une session fixture ne doit etre appelee qu'une fois meme avec N cas.""" - from protest import Use, fixture - call_count = 0 @fixture() diff --git a/uv.lock b/uv.lock index aa650bb..34a6ee8 
100644 --- a/uv.lock +++ b/uv.lock @@ -764,7 +764,7 @@ wheels = [ [[package]] name = "protest" -version = "0.1.1" +version = "0.1.2" source = { editable = "." } dependencies = [ { name = "typing-extensions" }, From 5f5e9a03cde8ef815d6379c54fd5536944b67991 Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Tue, 31 Mar 2026 09:09:40 +0200 Subject: [PATCH 09/60] =?UTF-8?q?feat(evals):=20Judge=20protocol=20?= =?UTF-8?q?=E2=80=94=20LLM-as-judge=20via=20inversion=20of=20dependency?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ProTest owns the interface, user plugs in their LLM library. - Judge protocol: `async judge(prompt, output_type) -> JudgeResponse[T]` - JudgeResponse wraps output with optional tokens/cost tracking - EvalContext.judge() unwraps for evaluators, accumulates usage stats - JudgeInfo auto-derived from instance for history - EvalPayload carries judge_call_count, tokens, cost per case - EvalSession(judge=MyJudge()) wires through to evaluators - suite.eval(judge=) for standalone usage - 19 new tests (protocol, ctx.judge, e2e, structured output, tokens) --- docs/evals.md | 119 ++++++++++++- protest/core/session.py | 2 + protest/core/suite.py | 2 + protest/entities/events.py | 4 + protest/evals/__init__.py | 4 + protest/evals/evaluator.py | 51 +++++- protest/evals/session.py | 12 +- protest/evals/types.py | 57 +++++- protest/evals/wrapper.py | 16 +- tests/evals/test_judge.py | 354 +++++++++++++++++++++++++++++++++++++ 10 files changed, 607 insertions(+), 14 deletions(-) create mode 100644 tests/evals/test_judge.py diff --git a/docs/evals.md b/docs/evals.md index b8cd74b..e13812b 100644 --- a/docs/evals.md +++ b/docs/evals.md @@ -162,6 +162,8 @@ The threshold (`min_recall`) is a parameter of the evaluator, not a framework co ### Async (LLM Judge) +Use `ctx.judge()` for structured LLM evaluation (requires `judge=` on `EvalSession`): + ```python @dataclass class 
JudgeResult: @@ -171,11 +173,15 @@ class JudgeResult: @evaluator async def llm_judge(ctx: EvalContext, rubric: str = "", min_score: float = 0.7) -> JudgeResult: - result = await judge_agent.run(f"Evaluate: {ctx.output}\nCriteria: {rubric}") - score = parse_score(result) - return JudgeResult(accuracy=score, accurate_enough=score >= min_score, reason=result.explanation) + return await ctx.judge( + f"Evaluate this response on a 0-1 scale.\n\n" + f"Response: {ctx.output}\nCriteria: {rubric}", + JudgeResult, + ) ``` +The judge handles structured output — no text parsing needed. See [Judge](#judge) for setup. + ### Per-Case Thresholds Different thresholds per case = different evaluator bindings: @@ -218,14 +224,16 @@ EvalCase(inputs="...", evaluators=[llm_judge(rubric="Check factual accuracy")]) ### EvalContext -| Field | Type | Description | -|-------|------|-------------| +| Field / Method | Type | Description | +|----------------|------|-------------| | `name` | `str` | Case name | | `inputs` | `I` | Case inputs | | `output` | `O` | Task return value | | `expected_output` | `O \| None` | From `EvalCase.expected` | | `metadata` | `Any` | From `EvalCase.metadata` | | `duration` | `float` | Task execution time (seconds) | +| `judge(prompt, type)` | `async` | Call the configured LLM judge (see [Judge](#judge)) | +| `judge_call_count` | `int` | Number of judge calls made | ### Built-in Evaluators @@ -270,6 +278,107 @@ async def pipeline_eval( session = EvalSession(model=ModelInfo(name="qwen-2.5")) ``` +## Judge + +A `Judge` is a protocol for LLM-as-judge evaluators. ProTest owns the interface — you plug in your LLM library. + +### The Protocol + +```python +class Judge(Protocol): + async def judge(self, prompt: str, output_type: type[T]) -> T: ... +``` + +Minimal contract: takes a prompt and a return type, returns a typed result. All configuration (model, temperature, system prompt, max_tokens) lives in your implementation's constructor, not in the protocol. 
+ +### Writing a Judge + +The `judge()` method returns a `JudgeResponse[T]` that wraps the output with optional usage stats: + +```python +from pydantic_ai import Agent +from protest.evals import JudgeResponse + +class PydanticAIJudge: + name = "gpt-4o-mini" # used in history + provider = "openai" # optional, used in history + + def __init__(self, model: str = "gpt-4o-mini", temperature: float = 0): + self.model = model + self.temperature = temperature + + async def judge(self, prompt: str, output_type: type[T]) -> JudgeResponse[T]: + agent = Agent(self.model, output_type=output_type) + result = await agent.run(prompt) + usage = result.usage() + return JudgeResponse( + output=result.output, + input_tokens=usage.request_tokens, + output_tokens=usage.response_tokens, + cost=usage.request_tokens * 0.15/1e6 + usage.response_tokens * 0.60/1e6, + ) +``` + +Tokens and cost are optional — omit them if your provider doesn't expose usage data: + +```python +return JudgeResponse(output=result.output) # tokens/cost = None, that's fine +``` + +### Configuring the Judge + +```python +session = EvalSession( + model=ModelInfo(name="qwen-2.5"), + judge=PydanticAIJudge(model="gpt-4o-mini", temperature=0), +) +``` + +`JudgeInfo` (name, provider) is derived automatically from the instance for history tracking. + +### Using the Judge in Evaluators + +Evaluators access the judge via `ctx.judge()`: + +```python +@dataclass +class JudgeResult: + accurate: Annotated[bool, Verdict] + reason: Annotated[str, Reason] = "" + +@evaluator +async def llm_rubric(ctx: EvalContext, rubric: str = "") -> JudgeResult: + return await ctx.judge( + f"Evaluate this response.\n\nResponse: {ctx.output}\nCriteria: {rubric}", + JudgeResult, # structured output — no text parsing + ) +``` + +For simple verdicts, use `bool` or `str` as `output_type`: + +```python +@evaluator +async def simple_judge(ctx: EvalContext) -> bool: + return await ctx.judge(f"Is this a valid answer? 
{ctx.output}", bool) +``` + +### No Judge Configured + +If an evaluator calls `ctx.judge()` and no judge was passed to `EvalSession`, a `RuntimeError` is raised. This is treated as an **infrastructure error** (not a test failure), same as a fixture crash. + +### Usage Tracking + +Each call to `ctx.judge()` is counted. Tokens and cost from `JudgeResponse` are accumulated per case and flow to `EvalPayload`: + +| Field | Description | +|-------|-------------| +| `judge_call_count` | Number of judge calls | +| `judge_input_tokens` | Total input tokens | +| `judge_output_tokens` | Total output tokens | +| `judge_cost` | Total cost (user-computed) | + +These are available in history, letting you track LLM usage across runs. + ## Evaluator Errors If an evaluator raises an exception (e.g. LLM judge timeout), the case is marked as **error** (not fail). The stack trace appears in the output. diff --git a/protest/core/session.py b/protest/core/session.py index 3224028..59962c5 100644 --- a/protest/core/session.py +++ b/protest/core/session.py @@ -86,6 +86,7 @@ def __init__( self._metadata: dict[str, Any] = dict(metadata) if metadata else {} self._eval_model: ModelInfo | None = None # set by EvalSession self._eval_judge: JudgeInfo | None = None # set by EvalSession + self._eval_judge_instance: Any = None # set by EvalSession async def resolve_autouse(self) -> None: """Resolve all session autouse fixtures at session start.""" @@ -241,6 +242,7 @@ def decorator(func: FuncT) -> FuncT: func, evaluators or [], expected_key, + judge=self._eval_judge_instance, ) suite.test(tags=tags, timeout=timeout, is_eval=True)(wrapper) self.add_suite(suite) diff --git a/protest/core/suite.py b/protest/core/suite.py index 1a8da5d..262d908 100644 --- a/protest/core/suite.py +++ b/protest/core/suite.py @@ -166,6 +166,7 @@ def eval( expected_key: str = "expected", tags: list[str] | None = None, timeout: float | None = None, + judge: Any = None, ) -> Callable[[FuncT], FuncT]: """Register a scored eval 
test on this suite.""" @@ -174,6 +175,7 @@ def decorator(func: FuncT) -> FuncT: func, evaluators or [], expected_key, + judge=judge, ) self.test(tags=tags, timeout=timeout, is_eval=True)(wrapper) return func diff --git a/protest/entities/events.py b/protest/entities/events.py index afb8971..33b43b2 100644 --- a/protest/entities/events.py +++ b/protest/entities/events.py @@ -30,6 +30,10 @@ class EvalPayload: scores: dict[str, EvalScoreEntry] = field(default_factory=dict) case_hash: str = "" eval_hash: str = "" + judge_call_count: int = 0 + judge_input_tokens: int = 0 + judge_output_tokens: int = 0 + judge_cost: float = 0.0 @dataclass(frozen=True, slots=True) diff --git a/protest/evals/__init__.py b/protest/evals/__init__.py index fdb5115..8e53005 100644 --- a/protest/evals/__init__.py +++ b/protest/evals/__init__.py @@ -13,7 +13,9 @@ EvalCaseResult, EvalScore, EvalSuiteReport, + Judge, JudgeInfo, + JudgeResponse, ModelInfo, ScoreStats, ) @@ -25,7 +27,9 @@ "EvalScore", "EvalSession", "EvalSuiteReport", + "Judge", "JudgeInfo", + "JudgeResponse", "Metric", "ModelInfo", "Reason", diff --git a/protest/evals/evaluator.py b/protest/evals/evaluator.py index 61a8a72..701fe5c 100644 --- a/protest/evals/evaluator.py +++ b/protest/evals/evaluator.py @@ -36,10 +36,14 @@ async def llm_judge(ctx: EvalContext, model: str = "haiku") -> bool: import functools import inspect from dataclasses import dataclass, field -from typing import Any, Generic, TypeVar +from typing import TYPE_CHECKING, Any, Generic, TypeVar + +if TYPE_CHECKING: + from protest.evals.types import Judge InputT = TypeVar("InputT") OutputT = TypeVar("OutputT") +T = TypeVar("T") @dataclass @@ -52,6 +56,51 @@ class EvalContext(Generic[InputT, OutputT]): expected_output: OutputT | None metadata: Any duration: float + _judge: Judge | None = field(default=None, repr=False) + _judge_call_count: int = field(default=0, repr=False, init=False) + _judge_input_tokens: int = field(default=0, repr=False, init=False) + 
_judge_output_tokens: int = field(default=0, repr=False, init=False) + _judge_cost: float = field(default=0.0, repr=False, init=False) + + async def judge(self, prompt: str, output_type: type[T]) -> T: + """Call the configured LLM judge and return the typed output. + + Tokens and cost from JudgeResponse are accumulated internally + and flow to EvalPayload for history/display. The evaluator + only sees the unwrapped output. + + Raises RuntimeError if no judge was configured on the session. + """ + if self._judge is None: + raise RuntimeError( + f"Evaluator for case '{self.name}' called ctx.judge() but no " + "judge is configured. Pass judge= to EvalSession()." + ) + self._judge_call_count += 1 + response = await self._judge.judge(prompt, output_type) + if response.input_tokens is not None: + self._judge_input_tokens += response.input_tokens + if response.output_tokens is not None: + self._judge_output_tokens += response.output_tokens + if response.cost is not None: + self._judge_cost += response.cost + return response.output + + @property + def judge_call_count(self) -> int: + return self._judge_call_count + + @property + def judge_input_tokens(self) -> int: + return self._judge_input_tokens + + @property + def judge_output_tokens(self) -> int: + return self._judge_output_tokens + + @property + def judge_cost(self) -> float: + return self._judge_cost @dataclass diff --git a/protest/evals/session.py b/protest/evals/session.py index 82bea35..81f22d9 100644 --- a/protest/evals/session.py +++ b/protest/evals/session.py @@ -9,7 +9,7 @@ if TYPE_CHECKING: from pathlib import Path - from protest.evals.types import JudgeInfo, ModelInfo + from protest.evals.types import Judge, ModelInfo class EvalSession(ProTestSession): @@ -28,7 +28,7 @@ def __init__( self, *, model: ModelInfo | None = None, - judge: JudgeInfo | None = None, + judge: Judge | None = None, concurrency: int = 1, history: bool = True, history_dir: Path | None = None, @@ -41,4 +41,10 @@ def __init__( 
metadata=metadata, ) self._eval_model = model - self._eval_judge = judge + self._eval_judge_instance: Judge | None = judge + if judge is not None: + from protest.evals.types import JudgeInfo + + self._eval_judge = JudgeInfo.from_instance(judge) + else: + self._eval_judge = None diff --git a/protest/evals/types.py b/protest/evals/types.py index 121264f..e928c86 100644 --- a/protest/evals/types.py +++ b/protest/evals/types.py @@ -4,7 +4,55 @@ import statistics from dataclasses import dataclass, field -from typing import Any +from typing import Any, Generic, Protocol, TypeVar, runtime_checkable + +T = TypeVar("T") + + +@dataclass(frozen=True, slots=True) +class JudgeResponse(Generic[T]): + """Return type for Judge.judge() — wraps the output with optional usage stats. + + Evaluators never see this: ``ctx.judge()`` unwraps and returns ``output``. + ProTest accumulates tokens/cost for history and display. + + Usage:: + + return JudgeResponse( + output=result.output, + input_tokens=usage.request_tokens, + output_tokens=usage.response_tokens, + cost=0.003, + ) + + # Or minimal — tokens/cost are optional: + return JudgeResponse(output=result.output) + """ + + output: T + input_tokens: int | None = None + output_tokens: int | None = None + cost: float | None = None + + +@runtime_checkable +class Judge(Protocol): + """Protocol for LLM judge implementations. + + All configuration (model, temperature, system_prompt, max_tokens) + lives in the constructor of the implementation, NOT in this protocol. + + Usage:: + + class MyJudge: + async def judge(self, prompt: str, output_type: type[T]) -> JudgeResponse[T]: + result = await agent.run(prompt) + return JudgeResponse(output=result.output, input_tokens=100) + + session = EvalSession(judge=MyJudge()) + """ + + async def judge(self, prompt: str, output_type: type[T]) -> JudgeResponse[T]: ... @dataclass(frozen=True, slots=True) @@ -40,6 +88,13 @@ class JudgeInfo: evaluators: tuple[str, ...] 
= () extra: dict[str, Any] = field(default_factory=dict) + @classmethod + def from_instance(cls, judge: Judge) -> JudgeInfo: + """Extract metadata from a Judge instance (duck-typed).""" + name = getattr(judge, "name", None) or type(judge).__name__ + provider = getattr(judge, "provider", None) + return cls(name=str(name), provider=provider) + @dataclass(frozen=True, slots=True) class EvalScore: diff --git a/protest/evals/wrapper.py b/protest/evals/wrapper.py index 537282b..b94c217 100644 --- a/protest/evals/wrapper.py +++ b/protest/evals/wrapper.py @@ -25,6 +25,7 @@ def make_eval_wrapper( func: Any, evaluators: list[Any], expected_key: str, + judge: Any = None, ) -> Any: """Wrap a function to run evaluators on its return value.""" @@ -46,7 +47,7 @@ async def eval_wrapper(**kwargs: Any) -> EvalPayload: per_case = _extract_per_case_evaluators(kwargs) all_evaluators.extend(per_case) - scores = await run_evaluators( + scores, eval_ctx = await run_evaluators( all_evaluators, case_name, inputs, @@ -54,6 +55,7 @@ async def eval_wrapper(**kwargs: Any) -> EvalPayload: expected, metadata, task_duration, + judge=judge, ) from protest.evals.hashing import compute_case_hash, compute_eval_hash @@ -75,6 +77,10 @@ async def eval_wrapper(**kwargs: Any) -> EvalPayload: }, case_hash=compute_case_hash(inputs, expected), eval_hash=compute_eval_hash(all_evaluators), + judge_call_count=eval_ctx.judge_call_count, + judge_input_tokens=eval_ctx.judge_input_tokens, + judge_output_tokens=eval_ctx.judge_output_tokens, + judge_cost=eval_ctx.judge_cost, ) return eval_wrapper @@ -155,8 +161,9 @@ async def run_evaluators( expected_output: Any, metadata: Any, duration: float, -) -> list[EvalScore]: - """Run evaluators and convert results to EvalScores.""" + judge: Any = None, +) -> tuple[list[EvalScore], EvalContext[Any, Any]]: + """Run evaluators and return (scores, ctx with judge stats).""" ctx = EvalContext( name=case_name, inputs=inputs, @@ -164,6 +171,7 @@ async def run_evaluators( 
expected_output=expected_output, metadata=metadata, duration=duration, + _judge=judge, ) scores: list[EvalScore] = [] @@ -182,7 +190,7 @@ async def run_evaluators( raise FixtureError(f"evaluator '{evaluator_name}'", exc) from exc - return scores + return scores, ctx async def _run_short_circuit( diff --git a/tests/evals/test_judge.py b/tests/evals/test_judge.py new file mode 100644 index 0000000..10106d9 --- /dev/null +++ b/tests/evals/test_judge.py @@ -0,0 +1,354 @@ +"""Tests for the Judge protocol and ctx.judge() integration.""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import Annotated, Any + +import pytest + +from protest import ForEach, From +from protest.core.runner import TestRunner +from protest.evals import ( + EvalContext, + EvalSession, + Judge, + JudgeResponse, + ModelInfo, + Verdict, + evaluator, +) +from protest.evals.types import JudgeInfo +from protest.plugin import PluginBase + + +# --------------------------------------------------------------------------- +# Fake judge for testing +# --------------------------------------------------------------------------- + + +class FakeJudge: + """Minimal Judge implementation for tests.""" + + name = "fake-judge" + provider = "test" + + async def judge(self, prompt: str, output_type: type) -> JudgeResponse: + if output_type is bool: + return JudgeResponse( + output="pass" in prompt.lower(), + input_tokens=10, + output_tokens=5, + cost=0.001, + ) + if output_type is str: + return JudgeResponse(output=f"judged: {prompt[:20]}") + # For dataclass types, try to construct with defaults + return JudgeResponse(output=output_type()) + + +class BareJudge: + """Judge without name/provider attrs — tests fallback.""" + + async def judge(self, prompt: str, output_type: type) -> JudgeResponse: + return JudgeResponse(output=True) + + +# --------------------------------------------------------------------------- +# Protocol compliance +# 
--------------------------------------------------------------------------- + + +class TestJudgeProtocol: + def test_fake_judge_satisfies_protocol(self) -> None: + assert isinstance(FakeJudge(), Judge) + + def test_bare_judge_satisfies_protocol(self) -> None: + assert isinstance(BareJudge(), Judge) + + def test_non_judge_rejected(self) -> None: + class NotAJudge: + def evaluate(self, prompt: str) -> str: + return "nope" + + assert not isinstance(NotAJudge(), Judge) + + +# --------------------------------------------------------------------------- +# JudgeInfo.from_instance +# --------------------------------------------------------------------------- + + +class TestJudgeInfoExtraction: + def test_from_instance_with_attrs(self) -> None: + info = JudgeInfo.from_instance(FakeJudge()) + assert info.name == "fake-judge" + assert info.provider == "test" + + def test_from_instance_fallback_to_class_name(self) -> None: + info = JudgeInfo.from_instance(BareJudge()) + assert info.name == "BareJudge" + assert info.provider is None + + +# --------------------------------------------------------------------------- +# EvalContext.judge() +# --------------------------------------------------------------------------- + + +class TestEvalContextJudge: + @pytest.mark.asyncio + async def test_judge_happy_path(self) -> None: + judge = FakeJudge() + ctx = EvalContext( + name="test_case", + inputs="q", + output="a", + expected_output=None, + metadata=None, + duration=0.1, + _judge=judge, + ) + result = await ctx.judge("pass this", bool) + assert result is True + + @pytest.mark.asyncio + async def test_judge_str_output(self) -> None: + judge = FakeJudge() + ctx = EvalContext( + name="test_case", + inputs="q", + output="a", + expected_output=None, + metadata=None, + duration=0.1, + _judge=judge, + ) + result = await ctx.judge("hello world", str) + assert result == "judged: hello world" + + @pytest.mark.asyncio + async def test_judge_raises_without_judge(self) -> None: + ctx = EvalContext( 
+ name="my_case", + inputs="q", + output="a", + expected_output=None, + metadata=None, + duration=0.1, + ) + with pytest.raises(RuntimeError, match="no judge is configured"): + await ctx.judge("test", bool) + + @pytest.mark.asyncio + async def test_judge_error_mentions_case_name(self) -> None: + ctx = EvalContext( + name="chatbot_eval", + inputs="q", + output="a", + expected_output=None, + metadata=None, + duration=0.1, + ) + with pytest.raises(RuntimeError, match="chatbot_eval"): + await ctx.judge("test", bool) + + @pytest.mark.asyncio + async def test_judge_call_count(self) -> None: + judge = FakeJudge() + ctx = EvalContext( + name="test_case", + inputs="q", + output="a", + expected_output=None, + metadata=None, + duration=0.1, + _judge=judge, + ) + assert ctx.judge_call_count == 0 + await ctx.judge("pass 1", bool) + assert ctx.judge_call_count == 1 + await ctx.judge("pass 2", bool) + await ctx.judge("pass 3", bool) + assert ctx.judge_call_count == 3 + + @pytest.mark.asyncio + async def test_judge_tokens_accumulated(self) -> None: + judge = FakeJudge() # returns input_tokens=10, output_tokens=5 for bool + ctx = EvalContext( + name="test_case", + inputs="q", + output="a", + expected_output=None, + metadata=None, + duration=0.1, + _judge=judge, + ) + await ctx.judge("pass 1", bool) + await ctx.judge("pass 2", bool) + assert ctx.judge_input_tokens == 20 + assert ctx.judge_output_tokens == 10 + + @pytest.mark.asyncio + async def test_judge_cost_accumulated(self) -> None: + judge = FakeJudge() # returns cost=0.001 for bool + ctx = EvalContext( + name="test_case", + inputs="q", + output="a", + expected_output=None, + metadata=None, + duration=0.1, + _judge=judge, + ) + await ctx.judge("pass 1", bool) + await ctx.judge("pass 2", bool) + assert ctx.judge_cost == pytest.approx(0.002) + + @pytest.mark.asyncio + async def test_judge_none_tokens_not_accumulated(self) -> None: + """JudgeResponse with tokens=None doesn't affect accumulation.""" + judge = FakeJudge() + ctx = 
EvalContext( + name="test_case", + inputs="q", + output="a", + expected_output=None, + metadata=None, + duration=0.1, + _judge=judge, + ) + await ctx.judge("hello", str) # FakeJudge returns no tokens for str + assert ctx.judge_input_tokens == 0 + assert ctx.judge_output_tokens == 0 + assert ctx.judge_cost == 0.0 + + +# --------------------------------------------------------------------------- +# E2E: EvalSession with judge +# --------------------------------------------------------------------------- + +single_case = ForEach( + [{"inputs": "hello", "expected": "hello", "name": "case_1"}], + ids=lambda c: c["name"], +) + + +class TestJudgeE2E: + def test_judge_available_in_evaluator(self) -> None: + """Full run: evaluator calls ctx.judge(), result is pass.""" + + @evaluator + async def judge_evaluator(ctx: EvalContext) -> bool: + return await ctx.judge("pass this", bool) + + session = EvalSession(judge=FakeJudge()) + + @session.eval(evaluators=[judge_evaluator]) + def eval_echo(case: Annotated[dict, From(single_case)]) -> str: + return case["inputs"] + + runner = TestRunner(session) + result = runner.run() + assert result.success is True + + def test_no_judge_is_fixture_error(self) -> None: + """Evaluator calls ctx.judge() without judge configured → infra error.""" + + @evaluator + async def needs_judge(ctx: EvalContext) -> bool: + return await ctx.judge("test", bool) + + session = EvalSession() # no judge + + @session.eval(evaluators=[needs_judge]) + def eval_echo(case: Annotated[dict, From(single_case)]) -> str: + return case["inputs"] + + results: list[Any] = [] + + class Collector(PluginBase): + name = "collector" + + def on_test_fail(self, result: Any) -> None: + results.append(result) + + session.register_plugin(Collector()) + runner = TestRunner(session) + result = runner.run() + assert result.success is False + assert len(results) == 1 + assert results[0].is_fixture_error is True + + def test_judge_call_count_in_payload(self) -> None: + """judge_call_count 
flows through to EvalPayload.""" + + @evaluator + async def double_judge(ctx: EvalContext) -> bool: + r1 = await ctx.judge("pass first", bool) + r2 = await ctx.judge("pass second", bool) + return r1 and r2 + + session = EvalSession(judge=FakeJudge()) + + @session.eval(evaluators=[double_judge]) + def eval_echo(case: Annotated[dict, From(single_case)]) -> str: + return case["inputs"] + + results: list[Any] = [] + + class Collector(PluginBase): + name = "collector" + + def on_test_pass(self, result: Any) -> None: + results.append(result) + + session.register_plugin(Collector()) + runner = TestRunner(session) + runner.run() + assert len(results) == 1 + payload = results[0].eval_payload + assert payload is not None + assert payload.judge_call_count == 2 + assert payload.judge_input_tokens == 20 # 10 per call × 2 + assert payload.judge_output_tokens == 10 # 5 per call × 2 + assert payload.judge_cost == pytest.approx(0.002) # 0.001 per call × 2 + + def test_judge_info_derived_from_instance(self) -> None: + """EvalSession derives JudgeInfo from Judge instance.""" + session = EvalSession(judge=FakeJudge()) + assert session._eval_judge is not None + assert session._eval_judge.name == "fake-judge" + assert session._eval_judge.provider == "test" + + def test_no_judge_no_judge_info(self) -> None: + """EvalSession without judge has no JudgeInfo.""" + session = EvalSession() + assert session._eval_judge is None + + def test_judge_with_structured_output(self) -> None: + """Judge returns structured dataclass via output_type.""" + + @dataclass + class JudgeVerdict: + ok: Annotated[bool, Verdict] + + class StructuredJudge: + name = "structured" + + async def judge(self, prompt: str, output_type: type) -> JudgeResponse: + return JudgeResponse(output=output_type(ok=True)) + + @evaluator + async def struct_evaluator(ctx: EvalContext) -> JudgeVerdict: + return await ctx.judge("evaluate this", JudgeVerdict) + + session = EvalSession(judge=StructuredJudge()) + + 
@session.eval(evaluators=[struct_evaluator]) + def eval_echo(case: Annotated[dict, From(single_case)]) -> str: + return case["inputs"] + + runner = TestRunner(session) + result = runner.run() + assert result.success is True From 015c451b6852e371c8cd557299b03d9f0f578ab7 Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Tue, 31 Mar 2026 12:50:34 +0200 Subject: [PATCH 10/60] fix(reporters): show in/out token split in eval usage summary Task: 45.2k in / 27.1k out, $0.0142 Judge: 5 calls, 800 in / 400 out, $0.0030 --- protest/core/runner.py | 7 +++ protest/entities/events.py | 3 + protest/evals/__init__.py | 2 + protest/evals/types.py | 73 +++++++++++++++++++++++ protest/evals/wrapper.py | 21 ++++++- protest/reporting/ascii.py | 21 +++++++ protest/reporting/rich_reporter.py | 23 ++++++++ tests/evals/test_judge.py | 95 ++++++++++++++++++++++++++++++ 8 files changed, 243 insertions(+), 2 deletions(-) diff --git a/protest/core/runner.py b/protest/core/runner.py index 4e58544..124cb44 100644 --- a/protest/core/runner.py +++ b/protest/core/runner.py @@ -227,4 +227,11 @@ def _build_eval_case_result(result: TestResult) -> EvalCaseResult: expected_output=payload.expected_output, case_hash=payload.case_hash, eval_hash=payload.eval_hash, + task_input_tokens=payload.task_input_tokens, + task_output_tokens=payload.task_output_tokens, + task_cost=payload.task_cost, + judge_call_count=payload.judge_call_count, + judge_input_tokens=payload.judge_input_tokens, + judge_output_tokens=payload.judge_output_tokens, + judge_cost=payload.judge_cost, ) diff --git a/protest/entities/events.py b/protest/entities/events.py index 33b43b2..d67388d 100644 --- a/protest/entities/events.py +++ b/protest/entities/events.py @@ -30,6 +30,9 @@ class EvalPayload: scores: dict[str, EvalScoreEntry] = field(default_factory=dict) case_hash: str = "" eval_hash: str = "" + task_input_tokens: int = 0 + task_output_tokens: int = 0 + task_cost: float = 0.0 
judge_call_count: int = 0 judge_input_tokens: int = 0 judge_output_tokens: int = 0 diff --git a/protest/evals/__init__.py b/protest/evals/__init__.py index 8e53005..628a275 100644 --- a/protest/evals/__init__.py +++ b/protest/evals/__init__.py @@ -18,6 +18,7 @@ JudgeResponse, ModelInfo, ScoreStats, + TaskResult, ) __all__ = [ @@ -35,6 +36,7 @@ "Reason", "ScoreStats", "ShortCircuit", + "TaskResult", "Verdict", "evaluator", ] diff --git a/protest/evals/types.py b/protest/evals/types.py index e928c86..7c8e14c 100644 --- a/protest/evals/types.py +++ b/protest/evals/types.py @@ -9,6 +9,36 @@ T = TypeVar("T") +@dataclass(frozen=True, slots=True) +class TaskResult(Generic[T]): + """Optional wrapper for eval task return values with usage stats. + + Return this instead of a plain value to report LLM usage for the + system under test. ProTest unwraps it transparently — evaluators + see the plain output. + + Usage:: + + @session.eval(evaluators=[...]) + async def my_eval(case) -> TaskResult[str]: + result = await agent.run(case.inputs) + usage = result.usage() + return TaskResult( + output=result.output, + input_tokens=usage.request_tokens, + output_tokens=usage.response_tokens, + cost=0.003, + ) + + # Or just return str directly — TaskResult is opt-in. + """ + + output: T + input_tokens: int | None = None + output_tokens: int | None = None + cost: float | None = None + + @dataclass(frozen=True, slots=True) class JudgeResponse(Generic[T]): """Return type for Judge.judge() — wraps the output with optional usage stats. 
@@ -149,6 +179,13 @@ class EvalCaseResult: expected_output: Any = None case_hash: str = "" eval_hash: str = "" + task_input_tokens: int = 0 + task_output_tokens: int = 0 + task_cost: float = 0.0 + judge_call_count: int = 0 + judge_input_tokens: int = 0 + judge_output_tokens: int = 0 + judge_cost: float = 0.0 @property def numeric_scores(self) -> dict[str, float]: @@ -228,3 +265,39 @@ def score_stats(self, name: str) -> ScoreStats: def all_score_stats(self) -> list[ScoreStats]: return [self.score_stats(n) for n in sorted(self.score_names())] + + @property + def total_task_input_tokens(self) -> int: + return sum(c.task_input_tokens for c in self.cases) + + @property + def total_task_output_tokens(self) -> int: + return sum(c.task_output_tokens for c in self.cases) + + @property + def total_task_tokens(self) -> int: + return self.total_task_input_tokens + self.total_task_output_tokens + + @property + def total_task_cost(self) -> float: + return sum(c.task_cost for c in self.cases) + + @property + def total_judge_calls(self) -> int: + return sum(c.judge_call_count for c in self.cases) + + @property + def total_judge_input_tokens(self) -> int: + return sum(c.judge_input_tokens for c in self.cases) + + @property + def total_judge_output_tokens(self) -> int: + return sum(c.judge_output_tokens for c in self.cases) + + @property + def total_judge_tokens(self) -> int: + return self.total_judge_input_tokens + self.total_judge_output_tokens + + @property + def total_judge_cost(self) -> float: + return sum(c.judge_cost for c in self.cases) diff --git a/protest/evals/wrapper.py b/protest/evals/wrapper.py index b94c217..82b21ad 100644 --- a/protest/evals/wrapper.py +++ b/protest/evals/wrapper.py @@ -38,11 +38,25 @@ async def eval_wrapper(**kwargs: Any) -> EvalPayload: start = time.perf_counter() if asyncio.iscoroutinefunction(func): - output = await func(**kwargs) + raw_output = await func(**kwargs) else: - output = func(**kwargs) + raw_output = func(**kwargs) task_duration = 
time.perf_counter() - start + # Unwrap TaskResult if returned + from protest.evals.types import TaskResult + + task_input_tokens = 0 + task_output_tokens = 0 + task_cost = 0.0 + if isinstance(raw_output, TaskResult): + output = raw_output.output + task_input_tokens = raw_output.input_tokens or 0 + task_output_tokens = raw_output.output_tokens or 0 + task_cost = raw_output.cost or 0.0 + else: + output = raw_output + all_evaluators = list(evaluators) per_case = _extract_per_case_evaluators(kwargs) all_evaluators.extend(per_case) @@ -77,6 +91,9 @@ async def eval_wrapper(**kwargs: Any) -> EvalPayload: }, case_hash=compute_case_hash(inputs, expected), eval_hash=compute_eval_hash(all_evaluators), + task_input_tokens=task_input_tokens, + task_output_tokens=task_output_tokens, + task_cost=task_cost, judge_call_count=eval_ctx.judge_call_count, judge_input_tokens=eval_ctx.judge_input_tokens, judge_output_tokens=eval_ctx.judge_output_tokens, diff --git a/protest/reporting/ascii.py b/protest/reporting/ascii.py index ea4040d..1620789 100644 --- a/protest/reporting/ascii.py +++ b/protest/reporting/ascii.py @@ -63,6 +63,19 @@ def _format_duration(seconds: float) -> str: return f"{seconds:.2f}s" +def _format_tokens(tokens: int) -> str: + return f"{tokens / 1000:.1f}k" if tokens >= 1000 else str(tokens) + + +def _format_usage(input_tokens: int, output_tokens: int, cost: float) -> str: + parts: list[str] = [] + if input_tokens > 0 or output_tokens > 0: + parts.append(f"{_format_tokens(input_tokens)} in / {_format_tokens(output_tokens)} out") + if cost > 0: + parts.append(f"${cost:.4f}") + return ", ".join(parts) + + class AsciiReporter(PluginBase): """Plain ASCII reporter. No colors, no emojis. 
Works everywhere.""" @@ -285,6 +298,14 @@ def on_eval_suite_end(self, report: Any) -> None: print(" " + "─" * 60) rate_pct = report.pass_rate * 100 print(f" Passed: {report.passed_count}/{report.total_count} ({rate_pct:.1f}%)") + if report.total_task_tokens > 0 or report.total_task_cost > 0: + print(f" Task: {_format_usage(report.total_task_input_tokens, report.total_task_output_tokens, report.total_task_cost)}") + if report.total_judge_calls > 0: + judge_parts = [f"{report.total_judge_calls} calls"] + usage = _format_usage(report.total_judge_input_tokens, report.total_judge_output_tokens, report.total_judge_cost) + if usage: + judge_parts.append(usage) + print(f" Judge: {', '.join(judge_parts)}") print() def on_session_complete(self, result: SessionResult) -> None: diff --git a/protest/reporting/rich_reporter.py b/protest/reporting/rich_reporter.py index 5e1e96b..c699f71 100644 --- a/protest/reporting/rich_reporter.py +++ b/protest/reporting/rich_reporter.py @@ -53,6 +53,21 @@ def _format_duration(seconds: float) -> str: return f"{seconds:.2f}s" +def _format_tokens(tokens: int) -> str: + """Format token count: 1234 → '1.2k', 45 → '45'.""" + return f"{tokens / 1000:.1f}k" if tokens >= 1000 else str(tokens) + + +def _format_usage(input_tokens: int, output_tokens: int, cost: float) -> str: + """Format usage stats as 'Xk in / Yk out, $0.0042'.""" + parts: list[str] = [] + if input_tokens > 0 or output_tokens > 0: + parts.append(f"{_format_tokens(input_tokens)} in / {_format_tokens(output_tokens)} out") + if cost > 0: + parts.append(f"${cost:.4f}") + return ", ".join(parts) + + def _format_eval_scores_inline(result: TestResult) -> str: """Format eval scores for inline display (e.g. 
' bg_score=0.8 char_id=1.0').""" if not result.eval_payload: @@ -447,6 +462,14 @@ def on_eval_suite_end(self, report: Any) -> None: self._print( f" [{color}]Passed: {report.passed_count}/{report.total_count} ({rate_pct:.1f}%)[/]" ) + if report.total_task_tokens > 0 or report.total_task_cost > 0: + self._print(f" [dim]Task: {_format_usage(report.total_task_input_tokens, report.total_task_output_tokens, report.total_task_cost)}[/]") + if report.total_judge_calls > 0: + judge_parts = [f"{report.total_judge_calls} calls"] + usage = _format_usage(report.total_judge_input_tokens, report.total_judge_output_tokens, report.total_judge_cost) + if usage: + judge_parts.append(usage) + self._print(f" [dim]Judge: {', '.join(judge_parts)}[/]") def on_session_complete(self, result: SessionResult) -> None: has_non_eval_failures = any(not r.is_eval for r in self._failed_results) diff --git a/tests/evals/test_judge.py b/tests/evals/test_judge.py index 10106d9..0a6006f 100644 --- a/tests/evals/test_judge.py +++ b/tests/evals/test_judge.py @@ -15,6 +15,7 @@ Judge, JudgeResponse, ModelInfo, + TaskResult, Verdict, evaluator, ) @@ -352,3 +353,97 @@ def eval_echo(case: Annotated[dict, From(single_case)]) -> str: runner = TestRunner(session) result = runner.run() assert result.success is True + + +# --------------------------------------------------------------------------- +# TaskResult: SUT usage tracking +# --------------------------------------------------------------------------- + + +class TestTaskResult: + def test_task_result_unwrapped_for_evaluators(self) -> None: + """TaskResult is unwrapped — evaluators see the plain output.""" + + @evaluator + def check_output(ctx: EvalContext) -> bool: + return ctx.output == "hello" # sees str, not TaskResult + + session = EvalSession() + + @session.eval(evaluators=[check_output]) + def eval_echo(case: Annotated[dict, From(single_case)]) -> TaskResult[str]: + return TaskResult( + output=case["inputs"], + input_tokens=100, + output_tokens=50, + 
cost=0.01, + ) + + runner = TestRunner(session) + result = runner.run() + assert result.success is True + + def test_task_usage_in_payload(self) -> None: + """TaskResult tokens/cost flow through to EvalPayload.""" + + @evaluator + def always_pass(ctx: EvalContext) -> bool: + return True + + session = EvalSession() + + @session.eval(evaluators=[always_pass]) + def eval_echo(case: Annotated[dict, From(single_case)]) -> TaskResult[str]: + return TaskResult( + output=case["inputs"], + input_tokens=200, + output_tokens=80, + cost=0.005, + ) + + results: list[Any] = [] + + class Collector(PluginBase): + name = "collector" + + def on_test_pass(self, result: Any) -> None: + results.append(result) + + session.register_plugin(Collector()) + runner = TestRunner(session) + runner.run() + assert len(results) == 1 + payload = results[0].eval_payload + assert payload is not None + assert payload.task_input_tokens == 200 + assert payload.task_output_tokens == 80 + assert payload.task_cost == pytest.approx(0.005) + + def test_plain_return_has_zero_task_usage(self) -> None: + """Plain return (no TaskResult) has zero task usage.""" + + @evaluator + def always_pass(ctx: EvalContext) -> bool: + return True + + session = EvalSession() + + @session.eval(evaluators=[always_pass]) + def eval_echo(case: Annotated[dict, From(single_case)]) -> str: + return case["inputs"] + + results: list[Any] = [] + + class Collector(PluginBase): + name = "collector" + + def on_test_pass(self, result: Any) -> None: + results.append(result) + + session.register_plugin(Collector()) + runner = TestRunner(session) + runner.run() + payload = results[0].eval_payload + assert payload.task_input_tokens == 0 + assert payload.task_output_tokens == 0 + assert payload.task_cost == 0.0 From 8e748ce0bba111f323aedf41c17a46561c107fc8 Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Tue, 31 Mar 2026 20:23:37 +0200 Subject: [PATCH 11/60] fix(history): exclude error-only runs 
from stats, propagate is_error flag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixture crashes (errored >= total_cases) were counted in pass_rates, score_values, and flaky — polluting stats with noise. Now: - EvalCaseResult.is_error propagated from TestResult.is_fixture_error - History serializes errored count per suite + is_error per case - _aggregate_suites skips error-only runs from stats entirely - _track_cases skips error cases from score_values and flaky - Error runs still visible in `protest history --runs` Also: docs/evals.md updated for TaskResult section and Judge protocol fix. --- docs/evals.md | 37 +++++++- protest/cli/history.py | 9 +- protest/core/runner.py | 1 + protest/evals/history.py | 2 + protest/evals/types.py | 7 +- tests/test_history_stats.py | 164 ++++++++++++++++++++++++++++++++++++ 6 files changed, 216 insertions(+), 4 deletions(-) create mode 100644 tests/test_history_stats.py diff --git a/docs/evals.md b/docs/evals.md index e13812b..006c403 100644 --- a/docs/evals.md +++ b/docs/evals.md @@ -286,10 +286,10 @@ A `Judge` is a protocol for LLM-as-judge evaluators. ProTest owns the interface ```python class Judge(Protocol): - async def judge(self, prompt: str, output_type: type[T]) -> T: ... + async def judge(self, prompt: str, output_type: type[T]) -> JudgeResponse[T]: ... ``` -Minimal contract: takes a prompt and a return type, returns a typed result. All configuration (model, temperature, system prompt, max_tokens) lives in your implementation's constructor, not in the protocol. +Minimal contract: takes a prompt and a return type, returns a `JudgeResponse` wrapping the typed result with optional usage stats. All configuration (model, temperature, system prompt, max_tokens) lives in your implementation's constructor, not in the protocol. ### Writing a Judge @@ -379,6 +379,39 @@ Each call to `ctx.judge()` is counted. 
Tokens and cost from `JudgeResponse` are These are available in history, letting you track LLM usage across runs. +## TaskResult (SUT Usage Tracking) + +If your eval task calls an LLM, you can report usage by returning `TaskResult` instead of a plain value: + +```python +from protest.evals import TaskResult + +@session.eval(evaluators=[my_scorer]) +async def chatbot(case: Annotated[EvalCase, From(cases)]) -> TaskResult[str]: + result = await agent.run(case.inputs) + usage = result.usage() + return TaskResult( + output=result.output, + input_tokens=usage.request_tokens, + output_tokens=usage.response_tokens, + cost=usage.request_tokens * 0.10/1e6 + usage.response_tokens * 0.30/1e6, + ) +``` + +This is **opt-in** — returning a plain `str` still works. ProTest unwraps `TaskResult` transparently: evaluators see the plain output, usage stats flow to the reporter and history. + +## Usage Display + +When task or judge usage data is available, ProTest shows a summary after the eval stats: + +``` + Passed: 16/26 (61.5%) + Task: 45.2k in / 27.1k out, $0.0142 + Judge: 5 calls, 800 in / 400 out, $0.0030 +``` + +Lines only appear when there is data. No `TaskResult` = no Task line. No judge configured = no Judge line. + ## Evaluator Errors If an evaluator raises an exception (e.g. LLM judge timeout), the case is marked as **error** (not fail). The stack trace appears in the output. 
diff --git a/protest/cli/history.py b/protest/cli/history.py index f9eb7ac..33b230b 100644 --- a/protest/cli/history.py +++ b/protest/cli/history.py @@ -406,9 +406,13 @@ def _aggregate_suites(entries: list[dict[str, Any]]) -> dict[str, dict[str, Any] "score_values": {}, } s = suites[name] - s["n_runs"] += 1 + errored = data.get("errored", 0) total = data.get("total_cases", 0) passed = data.get("passed", 0) + # Skip error-only runs (fixture crashes) from stats + if errored and errored >= total: + continue + s["n_runs"] += 1 if total: s["pass_rates"].append(passed / total) _track_cases(s, data.get("cases", {})) @@ -427,6 +431,9 @@ def _track_cases(suite: dict[str, Any], cases: dict[str, Any]) -> None: for cn, cd in cases.items(): if not isinstance(cd, dict): continue + # Skip errored cases (fixture crashes) from stats + if cd.get("is_error"): + continue if cn not in suite["cases_seen"]: suite["cases_seen"][cn] = {"runs": 0, "fails": 0} suite["cases_seen"][cn]["runs"] += 1 diff --git a/protest/core/runner.py b/protest/core/runner.py index 124cb44..f6bab5b 100644 --- a/protest/core/runner.py +++ b/protest/core/runner.py @@ -234,4 +234,5 @@ def _build_eval_case_result(result: TestResult) -> EvalCaseResult: judge_input_tokens=payload.judge_input_tokens, judge_output_tokens=payload.judge_output_tokens, judge_cost=payload.judge_cost, + is_error=result.is_fixture_error, ) diff --git a/protest/evals/history.py b/protest/evals/history.py index f7f2544..5551736 100644 --- a/protest/evals/history.py +++ b/protest/evals/history.py @@ -100,6 +100,7 @@ def _build_entry( "total_cases": report.total_count, "passed": report.passed_count, "failed": report.failed_count, + "errored": report.errored_count, "pass_rate": round(report.pass_rate, 4), "duration": round(report.duration, 2), "cases": {c.case_name: _serialize_case(c) for c in report.cases}, @@ -138,6 +139,7 @@ def _build_entry( def _serialize_case(case: EvalCaseResult) -> dict[str, Any]: entry: dict[str, Any] = { "passed": 
case.passed, + "is_error": case.is_error, "duration": round(case.duration, 3), "scores": {s.name: s.value for s in case.scores if s.is_metric}, "case_hash": case.case_hash, diff --git a/protest/evals/types.py b/protest/evals/types.py index 7c8e14c..323f32a 100644 --- a/protest/evals/types.py +++ b/protest/evals/types.py @@ -186,6 +186,7 @@ class EvalCaseResult: judge_input_tokens: int = 0 judge_output_tokens: int = 0 judge_cost: float = 0.0 + is_error: bool = False @property def numeric_scores(self) -> dict[str, float]: @@ -241,7 +242,11 @@ def passed_count(self) -> int: @property def failed_count(self) -> int: - return sum(1 for c in self.cases if not c.passed) + return sum(1 for c in self.cases if not c.passed and not c.is_error) + + @property + def errored_count(self) -> int: + return sum(1 for c in self.cases if c.is_error) @property def total_count(self) -> int: diff --git a/tests/test_history_stats.py b/tests/test_history_stats.py new file mode 100644 index 0000000..cc99c17 --- /dev/null +++ b/tests/test_history_stats.py @@ -0,0 +1,164 @@ +"""Tests for history stats — error-only runs must be excluded from stats.""" + +from __future__ import annotations + +from protest.cli.history import _aggregate_suites, _rich_score_arrows + + +def _make_entry( + suite_name: str = "pipeline", + passed: int = 0, + total: int = 0, + errored: int = 0, + cases: dict | None = None, +) -> dict: + """Build a minimal history entry with one suite.""" + return { + "suites": { + suite_name: { + "kind": "eval", + "passed": passed, + "total_cases": total, + "errored": errored, + "cases": cases or {}, + } + } + } + + +def _case(passed: bool, score: float) -> dict: + return {"passed": passed, "scores": {"accuracy": score}} + + +def _error_case() -> dict: + return {"passed": False, "is_error": True, "scores": {}} + + +class TestErrorRunsExcludedFromStats: + """Error-only runs (fixture crashes) are excluded from stats.""" + + def test_error_runs_not_counted(self) -> None: + """Runs where 
errored >= total should not count in n_runs or pass_rates.""" + entries = [ + _make_entry(passed=29, total=39, cases={"a": _case(True, 0.8)}), + _make_entry(passed=0, total=1, errored=1, cases={"x": _error_case()}), + _make_entry(passed=0, total=1, errored=1, cases={"x": _error_case()}), + _make_entry(passed=28, total=39, cases={"a": _case(True, 0.7)}), + _make_entry(passed=0, total=1, errored=1, cases={"x": _error_case()}), + ] + + suites = _aggregate_suites(entries) + s = suites["pipeline"] + + # Only 2 real runs counted + assert s["n_runs"] == 2 + assert len(s["pass_rates"]) == 2 + # pass_rates reflect only real runs + assert s["pass_rates"][0] == 29 / 39 + assert s["pass_rates"][1] == 28 / 39 + + def test_error_cases_not_tracked(self) -> None: + """Cases with is_error=True should not appear in cases_seen or score_values.""" + entries = [ + _make_entry( + passed=1, + total=2, + errored=0, + cases={ + "real_case": _case(True, 0.9), + "errored_case": _error_case(), + }, + ), + ] + + suites = _aggregate_suites(entries) + s = suites["pipeline"] + assert "real_case" in s["cases_seen"] + assert "errored_case" not in s["cases_seen"] + assert len(s["score_values"]["accuracy"]) == 1 + + def test_error_cases_not_in_flaky(self) -> None: + """Error cases should never appear as flaky.""" + entries = [ + _make_entry(passed=1, total=1, cases={"a": _case(True, 0.9)}), + _make_entry( + passed=0, + total=1, + errored=1, + cases={"a": _error_case()}, + ), + ] + + suites = _aggregate_suites(entries) + s = suites["pipeline"] + # Only the real run is counted + assert s["n_runs"] == 1 + assert len(s["flaky"]) == 0 + + def test_all_error_runs_produce_empty_suite(self) -> None: + """If ALL runs are errors, suite has 0 runs and empty stats.""" + entries = [ + _make_entry(passed=0, total=1, errored=1, cases={"x": _error_case()}), + _make_entry(passed=0, total=1, errored=1, cases={"x": _error_case()}), + ] + + suites = _aggregate_suites(entries) + # Suite exists but has 0 real runs + 
assert suites["pipeline"]["n_runs"] == 0 + assert suites["pipeline"]["pass_rates"] == [] + + def test_mixed_real_and_error_runs(self) -> None: + """Real data pattern: mostly errors with a few real runs.""" + entries = [ + _make_entry(passed=0, total=1, errored=1), # error + _make_entry(passed=0, total=1, errored=1), # error + _make_entry(passed=29, total=39, cases={"a": _case(True, 0.7)}), # real + _make_entry(passed=0, total=1, errored=1), # error + _make_entry(passed=0, total=1, errored=1), # error + _make_entry(passed=28, total=39, cases={"a": _case(True, 0.8)}), # real + _make_entry(passed=0, total=1, errored=1), # error + _make_entry(passed=0, total=1, errored=1), # error + _make_entry(passed=0, total=1, errored=1), # error + _make_entry(passed=0, total=1, errored=1), # error + ] + + suites = _aggregate_suites(entries) + s = suites["pipeline"] + + assert s["n_runs"] == 2 # not 10 + assert len(s["pass_rates"]) == 2 + # Arrows reflect only the 2 real runs, not the 8 errors + arrows = _rich_score_arrows(s["score_values"]) + # accuracy went 0.7 → 0.8 → should show ↗ + assert "↗" in arrows + + +class TestScoreArrowsWithCleanData: + """Score arrows with only real runs (no errors to filter).""" + + def test_stable_scores_show_no_trend(self) -> None: + entries = [ + _make_entry(passed=2, total=2, cases={"a": _case(True, 0.8)}), + _make_entry(passed=2, total=2, cases={"a": _case(True, 0.8)}), + ] + suites = _aggregate_suites(entries) + arrows = _rich_score_arrows(suites["pipeline"]["score_values"]) + assert "→" in arrows + + def test_improving_scores_show_up(self) -> None: + entries = [ + _make_entry(passed=1, total=1, cases={"a": _case(True, 0.3)}), + _make_entry(passed=1, total=1, cases={"a": _case(True, 0.9)}), + ] + suites = _aggregate_suites(entries) + arrows = _rich_score_arrows(suites["pipeline"]["score_values"]) + assert "↗" in arrows + + def test_declining_scores_show_down(self) -> None: + entries = [ + _make_entry(passed=1, total=1, cases={"a": _case(True, 
0.9)}), + _make_entry(passed=1, total=1, cases={"a": _case(True, 0.3)}), + ] + suites = _aggregate_suites(entries) + arrows = _rich_score_arrows(suites["pipeline"]["score_values"]) + assert "↘" in arrows From 6149633b52ee94623dc36cd2dd63b1b2a9fa814c Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Tue, 31 Mar 2026 21:53:29 +0200 Subject: [PATCH 12/60] =?UTF-8?q?refactor:=20remove=20getattr=20abuse=20?= =?UTF-8?q?=E2=80=94=20proper=20typing=20and=20Protocol=20contracts?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Remove defensive getattr in session.py where types are known - Type plugin setup(session: ProTestSession) instead of Any - Add name/provider to Judge Protocol — explicit contract - Delete ModelInfo.from_agent and JudgeInfo.from_instance — user wires - Fix lint: PLR2004 magic values, PLR0912 noqa, ambiguous unicode --- examples/yorkshire/app/chatbot.py | 2 +- protest/core/session.py | 8 +++---- protest/evals/history.py | 9 +++---- protest/evals/session.py | 2 +- protest/evals/types.py | 27 +++++---------------- protest/history/plugin.py | 11 +++++---- protest/reporting/ascii.py | 23 ++++++++++++++---- protest/reporting/rich_reporter.py | 23 ++++++++++++++---- tests/evals/test_judge.py | 38 +++++++++--------------------- 9 files changed, 72 insertions(+), 71 deletions(-) diff --git a/examples/yorkshire/app/chatbot.py b/examples/yorkshire/app/chatbot.py index dedc1e4..82ca519 100644 --- a/examples/yorkshire/app/chatbot.py +++ b/examples/yorkshire/app/chatbot.py @@ -25,7 +25,7 @@ } -def yorkshire_chatbot(question: str) -> str: +def yorkshire_chatbot(question: str) -> str: # noqa: PLR0912 """Fake chatbot that answers questions about Yorkshire Terriers. Simulates a RAG pipeline: keyword matching → fact retrieval → response generation. 
diff --git a/protest/core/session.py b/protest/core/session.py index 59962c5..7ea04f3 100644 --- a/protest/core/session.py +++ b/protest/core/session.py @@ -228,10 +228,10 @@ async def my_eval(case: Annotated[dict, From(cases)]) -> str: def decorator(func: FuncT) -> FuncT: suite_name = name or func.__name__ suite_meta: dict[str, Any] = {} - resolved_model = model or getattr(self, "_eval_model", None) + resolved_model = model or self._eval_model if resolved_model: suite_meta["model"] = resolved_model.name - suite_meta["provider"] = getattr(resolved_model, "provider", None) + suite_meta["provider"] = resolved_model.provider suite = ProTestSuite( name=suite_name, tags=list(tags or []), @@ -384,8 +384,8 @@ def _wire_eval_support(self) -> None: if self._eval_judge: judge_dict = { "name": self._eval_judge.name, - "provider": getattr(self._eval_judge, "provider", None), - "evaluators": list(getattr(self._eval_judge, "evaluators", ())), + "provider": self._eval_judge.provider, + "evaluators": list(self._eval_judge.evaluators), } history = EvalHistoryPlugin( diff --git a/protest/evals/history.py b/protest/evals/history.py index 5551736..b607566 100644 --- a/protest/evals/history.py +++ b/protest/evals/history.py @@ -13,6 +13,7 @@ if TYPE_CHECKING: from pathlib import Path + from protest.core.session import ProTestSession from protest.evals.types import EvalCaseResult, EvalSuiteReport, ModelInfo from protest.plugin import PluginContext @@ -47,12 +48,12 @@ def __init__( def activate(cls, ctx: PluginContext) -> EvalHistoryPlugin | None: return None # Wired explicitly by session - def setup(self, session: Any) -> None: + def setup(self, session: ProTestSession) -> None: """Collect per-suite metadata from session.""" self._suite_metadata = {} - for suite in getattr(session, "suites", []): - if getattr(suite, "kind", "test") == "eval": - self._suite_metadata[suite.name] = getattr(suite, "suite_metadata", {}) + for suite in session.suites: + if suite.kind == "eval": + 
self._suite_metadata[suite.name] = suite.suite_metadata def on_eval_suite_end(self, report: EvalSuiteReport) -> None: """Collect suite reports as they arrive.""" diff --git a/protest/evals/session.py b/protest/evals/session.py index 81f22d9..ddace3d 100644 --- a/protest/evals/session.py +++ b/protest/evals/session.py @@ -45,6 +45,6 @@ def __init__( if judge is not None: from protest.evals.types import JudgeInfo - self._eval_judge = JudgeInfo.from_instance(judge) + self._eval_judge = JudgeInfo(name=judge.name, provider=judge.provider) else: self._eval_judge = None diff --git a/protest/evals/types.py b/protest/evals/types.py index 323f32a..59d2721 100644 --- a/protest/evals/types.py +++ b/protest/evals/types.py @@ -75,6 +75,9 @@ class Judge(Protocol): Usage:: class MyJudge: + name = "my-judge" + provider = "openai" + async def judge(self, prompt: str, output_type: type[T]) -> JudgeResponse[T]: result = await agent.run(prompt) return JudgeResponse(output=result.output, input_tokens=100) @@ -82,6 +85,9 @@ async def judge(self, prompt: str, output_type: type[T]) -> JudgeResponse[T]: session = EvalSession(judge=MyJudge()) """ + name: str + provider: str | None + async def judge(self, prompt: str, output_type: type[T]) -> JudgeResponse[T]: ... @@ -94,20 +100,6 @@ class ModelInfo: temperature: float | None = None extra: dict[str, Any] = field(default_factory=dict) - @classmethod - def from_agent(cls, agent: Any) -> ModelInfo: - """Extract model info from a pydantic-ai Agent (duck-typed).""" - model = getattr(agent, "model", None) - if model is None: - msg = "Agent has no model configured" - raise ValueError(msg) - if isinstance(model, str): - return cls(name=model) - model_name = getattr(model, "model_name", None) - if callable(model_name): - return cls(name=str(model_name())) - return cls(name=str(getattr(model, "name", None) or model)) - @dataclass(frozen=True, slots=True) class JudgeInfo: @@ -118,13 +110,6 @@ class JudgeInfo: evaluators: tuple[str, ...] 
= () extra: dict[str, Any] = field(default_factory=dict) - @classmethod - def from_instance(cls, judge: Judge) -> JudgeInfo: - """Extract metadata from a Judge instance (duck-typed).""" - name = getattr(judge, "name", None) or type(judge).__name__ - provider = getattr(judge, "provider", None) - return cls(name=str(name), provider=provider) - @dataclass(frozen=True, slots=True) class EvalScore: diff --git a/protest/history/plugin.py b/protest/history/plugin.py index 4fe80f6..e216fe6 100644 --- a/protest/history/plugin.py +++ b/protest/history/plugin.py @@ -13,6 +13,7 @@ if TYPE_CHECKING: from pathlib import Path + from protest.core.session import ProTestSession from protest.entities.events import SessionResult, TestResult from protest.plugin import PluginContext @@ -36,11 +37,11 @@ def __init__(self, history_dir: Path | None = None) -> None: def activate(cls, ctx: PluginContext) -> HistoryPlugin | None: return None # Wired explicitly by session - def setup(self, session: Any) -> None: - self._history_enabled = getattr(session, "history", False) - self._metadata = dict(getattr(session, "metadata", None) or {}) - for suite in getattr(session, "suites", []): - self._suite_kinds[suite.name] = getattr(suite, "kind", "test") + def setup(self, session: ProTestSession) -> None: + self._history_enabled = session.history + self._metadata = dict(session.metadata) + for suite in session.suites: + self._suite_kinds[suite.name] = suite.kind if not self._default_suite_name or self._default_suite_name == "tests": self._default_suite_name = suite.name diff --git a/protest/reporting/ascii.py b/protest/reporting/ascii.py index 1620789..9296ae6 100644 --- a/protest/reporting/ascii.py +++ b/protest/reporting/ascii.py @@ -63,14 +63,23 @@ def _format_duration(seconds: float) -> str: return f"{seconds:.2f}s" +_TOKEN_K_THRESHOLD = 1000 + + def _format_tokens(tokens: int) -> str: - return f"{tokens / 1000:.1f}k" if tokens >= 1000 else str(tokens) + return ( + f"{tokens / 
_TOKEN_K_THRESHOLD:.1f}k" + if tokens >= _TOKEN_K_THRESHOLD + else str(tokens) + ) def _format_usage(input_tokens: int, output_tokens: int, cost: float) -> str: parts: list[str] = [] if input_tokens > 0 or output_tokens > 0: - parts.append(f"{_format_tokens(input_tokens)} in / {_format_tokens(output_tokens)} out") + parts.append( + f"{_format_tokens(input_tokens)} in / {_format_tokens(output_tokens)} out" + ) if cost > 0: parts.append(f"${cost:.4f}") return ", ".join(parts) @@ -299,10 +308,16 @@ def on_eval_suite_end(self, report: Any) -> None: rate_pct = report.pass_rate * 100 print(f" Passed: {report.passed_count}/{report.total_count} ({rate_pct:.1f}%)") if report.total_task_tokens > 0 or report.total_task_cost > 0: - print(f" Task: {_format_usage(report.total_task_input_tokens, report.total_task_output_tokens, report.total_task_cost)}") + print( + f" Task: {_format_usage(report.total_task_input_tokens, report.total_task_output_tokens, report.total_task_cost)}" + ) if report.total_judge_calls > 0: judge_parts = [f"{report.total_judge_calls} calls"] - usage = _format_usage(report.total_judge_input_tokens, report.total_judge_output_tokens, report.total_judge_cost) + usage = _format_usage( + report.total_judge_input_tokens, + report.total_judge_output_tokens, + report.total_judge_cost, + ) if usage: judge_parts.append(usage) print(f" Judge: {', '.join(judge_parts)}") diff --git a/protest/reporting/rich_reporter.py b/protest/reporting/rich_reporter.py index c699f71..981b03f 100644 --- a/protest/reporting/rich_reporter.py +++ b/protest/reporting/rich_reporter.py @@ -53,16 +53,25 @@ def _format_duration(seconds: float) -> str: return f"{seconds:.2f}s" +_TOKEN_K_THRESHOLD = 1000 + + def _format_tokens(tokens: int) -> str: """Format token count: 1234 → '1.2k', 45 → '45'.""" - return f"{tokens / 1000:.1f}k" if tokens >= 1000 else str(tokens) + return ( + f"{tokens / _TOKEN_K_THRESHOLD:.1f}k" + if tokens >= _TOKEN_K_THRESHOLD + else str(tokens) + ) def 
_format_usage(input_tokens: int, output_tokens: int, cost: float) -> str: """Format usage stats as 'Xk in / Yk out, $0.0042'.""" parts: list[str] = [] if input_tokens > 0 or output_tokens > 0: - parts.append(f"{_format_tokens(input_tokens)} in / {_format_tokens(output_tokens)} out") + parts.append( + f"{_format_tokens(input_tokens)} in / {_format_tokens(output_tokens)} out" + ) if cost > 0: parts.append(f"${cost:.4f}") return ", ".join(parts) @@ -463,10 +472,16 @@ def on_eval_suite_end(self, report: Any) -> None: f" [{color}]Passed: {report.passed_count}/{report.total_count} ({rate_pct:.1f}%)[/]" ) if report.total_task_tokens > 0 or report.total_task_cost > 0: - self._print(f" [dim]Task: {_format_usage(report.total_task_input_tokens, report.total_task_output_tokens, report.total_task_cost)}[/]") + self._print( + f" [dim]Task: {_format_usage(report.total_task_input_tokens, report.total_task_output_tokens, report.total_task_cost)}[/]" + ) if report.total_judge_calls > 0: judge_parts = [f"{report.total_judge_calls} calls"] - usage = _format_usage(report.total_judge_input_tokens, report.total_judge_output_tokens, report.total_judge_cost) + usage = _format_usage( + report.total_judge_input_tokens, + report.total_judge_output_tokens, + report.total_judge_cost, + ) if usage: judge_parts.append(usage) self._print(f" [dim]Judge: {', '.join(judge_parts)}[/]") diff --git a/tests/evals/test_judge.py b/tests/evals/test_judge.py index 0a6006f..9e6fd11 100644 --- a/tests/evals/test_judge.py +++ b/tests/evals/test_judge.py @@ -14,15 +14,12 @@ EvalSession, Judge, JudgeResponse, - ModelInfo, TaskResult, Verdict, evaluator, ) -from protest.evals.types import JudgeInfo from protest.plugin import PluginBase - # --------------------------------------------------------------------------- # Fake judge for testing # --------------------------------------------------------------------------- @@ -31,8 +28,8 @@ class FakeJudge: """Minimal Judge implementation for tests.""" - name = 
"fake-judge" - provider = "test" + name: str = "fake-judge" + provider: str | None = "test" async def judge(self, prompt: str, output_type: type) -> JudgeResponse: if output_type is bool: @@ -49,7 +46,10 @@ async def judge(self, prompt: str, output_type: type) -> JudgeResponse: class BareJudge: - """Judge without name/provider attrs — tests fallback.""" + """Minimal Judge with required name/provider.""" + + name: str = "bare-judge" + provider: str | None = None async def judge(self, prompt: str, output_type: type) -> JudgeResponse: return JudgeResponse(output=True) @@ -75,23 +75,6 @@ def evaluate(self, prompt: str) -> str: assert not isinstance(NotAJudge(), Judge) -# --------------------------------------------------------------------------- -# JudgeInfo.from_instance -# --------------------------------------------------------------------------- - - -class TestJudgeInfoExtraction: - def test_from_instance_with_attrs(self) -> None: - info = JudgeInfo.from_instance(FakeJudge()) - assert info.name == "fake-judge" - assert info.provider == "test" - - def test_from_instance_fallback_to_class_name(self) -> None: - info = JudgeInfo.from_instance(BareJudge()) - assert info.name == "BareJudge" - assert info.provider is None - - # --------------------------------------------------------------------------- # EvalContext.judge() # --------------------------------------------------------------------------- @@ -311,9 +294,9 @@ def on_test_pass(self, result: Any) -> None: payload = results[0].eval_payload assert payload is not None assert payload.judge_call_count == 2 - assert payload.judge_input_tokens == 20 # 10 per call × 2 - assert payload.judge_output_tokens == 10 # 5 per call × 2 - assert payload.judge_cost == pytest.approx(0.002) # 0.001 per call × 2 + assert payload.judge_input_tokens == 20 # 10 per call x 2 + assert payload.judge_output_tokens == 10 # 5 per call x 2 + assert payload.judge_cost == pytest.approx(0.002) # 0.001 per call x 2 def 
test_judge_info_derived_from_instance(self) -> None: """EvalSession derives JudgeInfo from Judge instance.""" @@ -335,7 +318,8 @@ class JudgeVerdict: ok: Annotated[bool, Verdict] class StructuredJudge: - name = "structured" + name: str = "structured" + provider: str | None = None async def judge(self, prompt: str, output_type: type) -> JudgeResponse: return JudgeResponse(output=output_type(ok=True)) From c08125535139cbaff4e74dec27f0f6413c992a12 Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Tue, 31 Mar 2026 23:30:54 +0200 Subject: [PATCH 13/60] fix(hashing): fail-hard canonicalization, evaluator_identity() protocol Replace fragile repr() fallback with explicit error on unknown types. Add evaluator_identity() as user-controlled escape hatch for custom evaluators. Introspect dataclass/partial/callable as fallback only. - Remove hasattr(obj, "model_dump") duck-typing (Pydantic leak) - Remove default=str silent fallback in json.dumps - Skip _prefixed dataclass fields (runtime internals, not config) - Add functools.partial support (qualname + bound kwargs) - Add ShortCircuit.evaluator_identity() - 33 tests covering all paths including fail-hard --- protest/evals/evaluator.py | 16 ++- protest/evals/hashing.py | 82 ++++++++++--- tests/evals/test_hashing.py | 239 ++++++++++++++++++++++++++++++++++-- 3 files changed, 308 insertions(+), 29 deletions(-) diff --git a/protest/evals/evaluator.py b/protest/evals/evaluator.py index 701fe5c..7fdf6c7 100644 --- a/protest/evals/evaluator.py +++ b/protest/evals/evaluator.py @@ -48,7 +48,15 @@ async def llm_judge(ctx: EvalContext, model: str = "haiku") -> bool: @dataclass class EvalContext(Generic[InputT, OutputT]): - """Context passed to evaluator functions.""" + """Context passed to evaluator functions. + + Dual role: read-only DTO (inputs, output, expected) + mutable accumulator + for judge call stats (tokens, cost, call count). 
One instance per case, + shared sequentially across evaluators, discarded after scoring. + + Note: judge stats accumulate via ctx.judge() side-effects. If evaluators + are ever parallelized within a case, the accumulators will need isolation. + """ name: str inputs: InputT @@ -149,6 +157,12 @@ class ShortCircuit: def __init__(self, evaluators: list[Any]) -> None: self.evaluators = evaluators + def evaluator_identity(self) -> dict[str, Any]: + """Identity is the ordered list of inner evaluators.""" + from protest.evals.hashing import _canonical + + return {"short_circuit": [_canonical(e) for e in self.evaluators]} + class Metric: """Annotate a float/int field as a metric for stats aggregation.""" diff --git a/protest/evals/hashing.py b/protest/evals/hashing.py index 0f0f5e9..5ebe725 100644 --- a/protest/evals/hashing.py +++ b/protest/evals/hashing.py @@ -1,8 +1,18 @@ -"""Content hashing for eval cases — detect when cases or scoring change.""" +"""Content hashing for eval cases — detect when cases or scoring change. + +Hashes capture identity + configuration, not implementation. A renamed +parameter changes the hash; a rewritten function body does not. This is +a deliberate trade-off: we detect config drift, not code drift. + +Custom evaluators can implement ``evaluator_identity()`` to control +exactly what gets hashed. Built-in types (dataclass, functools.partial, +plain callable) are introspected automatically as a fallback. 
+""" from __future__ import annotations import dataclasses +import functools import hashlib import json from typing import Any @@ -10,42 +20,80 @@ HASH_LENGTH = 12 +class CanonicalError(TypeError): + """Raised when an object cannot be converted to a canonical form.""" + + def compute_case_hash(inputs: Any, expected_output: Any) -> str: """Hash the case content (inputs + expected_output).""" data = {"inputs": _canonical(inputs), "expected": _canonical(expected_output)} return _hash(data) -def compute_eval_hash( - evaluators: list[Any], -) -> str: +def compute_eval_hash(evaluators: list[Any]) -> str: """Hash the scoring config (evaluators only).""" - data = { - "evaluators": [_canonical(e) for e in evaluators], - } + data = {"evaluators": [_canonical(e) for e in evaluators]} return _hash(data) def _hash(data: Any) -> str: - raw = json.dumps(data, sort_keys=True, default=str) + raw = json.dumps(data, sort_keys=True) return hashlib.sha256(raw.encode()).hexdigest()[:HASH_LENGTH] -def _canonical(obj: Any) -> Any: - """Convert an object to a canonical JSON-serializable form.""" +def _canonical(obj: Any) -> Any: # noqa: PLR0911 + """Convert an object to a canonical JSON-serializable form. + + Resolution order: + 1. Primitives, list, tuple, dict — native support + 2. ``evaluator_identity()`` — explicit, user-controlled + 3. Dataclass / functools.partial / callable — introspection fallback + 4. 
Anything else → CanonicalError + """ + # --- primitives & containers --- if obj is None or isinstance(obj, (bool, int, float, str)): return obj if isinstance(obj, (list, tuple)): return [_canonical(item) for item in obj] if isinstance(obj, dict): return {str(k): _canonical(v) for k, v in sorted(obj.items())} - # Pydantic models - if hasattr(obj, "model_dump"): - return _canonical(obj.model_dump(mode="json")) - # Dataclasses — iterate without deepcopy to support non-picklable fields + + # --- explicit identity (user-controlled) --- + if hasattr(obj, "evaluator_identity"): + return _canonical(obj.evaluator_identity()) + + # --- introspection fallback --- + + # Dataclasses — public fields only (skip _ prefixed runtime internals) if dataclasses.is_dataclass(obj) and not isinstance(obj, type): return { - f.name: _canonical(getattr(obj, f.name)) for f in dataclasses.fields(obj) + "__type__": type(obj).__qualname__, + **{ + f.name: _canonical(getattr(obj, f.name)) + for f in dataclasses.fields(obj) + if not f.name.startswith("_") + }, + } + # functools.partial — qualname + bound kwargs + if isinstance(obj, functools.partial): + return { + "fn": _fn_qualname(obj.func), + "args": _canonical(list(obj.args)) if obj.args else [], + "kwargs": _canonical(dict(obj.keywords)) if obj.keywords else {}, } - # Fallback - return repr(obj) + # Plain callable — qualname only + if callable(obj): + qualname = _fn_qualname(obj) + if qualname is not None: + return {"fn": qualname} + + raise CanonicalError( + f"Cannot canonicalize {type(obj).__name__!r}. " + f"Implement evaluator_identity() or use a supported type " + f"(primitives, list, dict, dataclass, callable)." 
+ ) + + +def _fn_qualname(fn: Any) -> str | None: + """Extract a stable qualified name from a callable.""" + return getattr(fn, "__qualname__", None) or getattr(fn, "__name__", None) diff --git a/tests/evals/test_hashing.py b/tests/evals/test_hashing.py index bc53e1f..26e5570 100644 --- a/tests/evals/test_hashing.py +++ b/tests/evals/test_hashing.py @@ -1,14 +1,22 @@ -"""Tests for protest.evals.hashing — including non-picklable dataclass fields.""" +"""Tests for protest.evals.hashing — fail-hard canonicalization.""" from __future__ import annotations import dataclasses +import functools import threading -from protest.evals.hashing import _canonical, compute_eval_hash +import pytest + +from protest.evals.hashing import ( + CanonicalError, + _canonical, + compute_case_hash, + compute_eval_hash, +) # --------------------------------------------------------------------------- -# _canonical — dataclass handling +# Fixtures — representative evaluator types # --------------------------------------------------------------------------- @@ -32,24 +40,218 @@ class LockHoldingEvaluator: _lock: threading.Lock = dataclasses.field(default_factory=threading.Lock) +def bare_function(ctx: object) -> bool: + return True + + +def parameterized_function(ctx: object, keywords: list[str]) -> bool: + return True + + +# --------------------------------------------------------------------------- +# _canonical — primitives & containers +# --------------------------------------------------------------------------- + + +class TestCanonicalPrimitives: + @pytest.mark.parametrize("value", [None, True, False, 42, 3.14, "hello"]) + def test_primitives_pass_through(self, value: object) -> None: + assert _canonical(value) is value + + def test_list(self) -> None: + assert _canonical([1, "a", [2]]) == [1, "a", [2]] + + def test_tuple_treated_as_list(self) -> None: + assert _canonical((1, 2)) == [1, 2] + + def test_dict_sorted_by_key(self) -> None: + assert _canonical({"b": 2, "a": 1}) == {"a": 1, "b": 
2} + + +# --------------------------------------------------------------------------- +# _canonical — dataclass handling +# --------------------------------------------------------------------------- + + class TestCanonicalDataclass: def test_simple_dataclass_is_serialized(self) -> None: ev = SimpleEvaluator(threshold=0.8) result = _canonical(ev) - assert result == {"threshold": 0.8, "name": "simple"} + assert result == { + "__type__": "SimpleEvaluator", + "threshold": 0.8, + "name": "simple", + } def test_nested_dataclass_is_serialized_recursively(self) -> None: ev = NestedEvaluator(inner=SimpleEvaluator(threshold=0.5), weight=2.0) result = _canonical(ev) - assert result == {"inner": {"threshold": 0.5, "name": "simple"}, "weight": 2.0} + assert result == { + "__type__": "NestedEvaluator", + "inner": { + "__type__": "SimpleEvaluator", + "threshold": 0.5, + "name": "simple", + }, + "weight": 2.0, + } + + def test_dataclass_with_lock_skips_private_fields(self) -> None: + """Regression: dataclasses.asdict() deepcopy fails on threading.Lock. - def test_dataclass_with_lock_does_not_crash(self) -> None: - """Regression: dataclasses.asdict() deepcopy fails on threading.Lock.""" + Private fields (_prefixed) are runtime internals, not config — excluded from hash. 
+ """ ev = LockHoldingEvaluator(name="llm_judge") - # Must not raise — lock falls back to repr() result = _canonical(ev) - assert result["name"] == "llm_judge" - assert "_lock" in result + assert result == {"__type__": "LockHoldingEvaluator", "name": "llm_judge"} + assert "_lock" not in result + + +# --------------------------------------------------------------------------- +# _canonical — callables (the real-world evaluator path) +# --------------------------------------------------------------------------- + + +class TestCanonicalCallable: + def test_bare_function(self) -> None: + result = _canonical(bare_function) + assert result == {"fn": "bare_function"} + + def test_partial_captures_qualname_and_kwargs(self) -> None: + bound = functools.partial(parameterized_function, keywords=["paris"]) + result = _canonical(bound) + assert result == { + "fn": "parameterized_function", + "args": [], + "kwargs": {"keywords": ["paris"]}, + } + + def test_partial_different_kwargs_different_canonical(self) -> None: + a = functools.partial(parameterized_function, keywords=["paris"]) + b = functools.partial(parameterized_function, keywords=["lyon"]) + assert _canonical(a) != _canonical(b) + + def test_partial_same_kwargs_same_canonical(self) -> None: + a = functools.partial(parameterized_function, keywords=["paris"]) + b = functools.partial(parameterized_function, keywords=["paris"]) + assert _canonical(a) == _canonical(b) + + +# --------------------------------------------------------------------------- +# _canonical — evaluator_identity (explicit, user-controlled) +# --------------------------------------------------------------------------- + + +class TestCanonicalEvaluatorIdentity: + def test_evaluator_identity_takes_precedence(self) -> None: + """evaluator_identity() is used over introspection when available.""" + + class CustomScorer: + def __init__(self, model: str, temperature: float): + self.model = model + self.temperature = temperature + self._client = object() # 
runtime state, not config + + def evaluator_identity(self) -> dict: + return {"model": self.model, "temperature": self.temperature} + + result = _canonical(CustomScorer(model="gpt-4", temperature=0.7)) + assert result == {"model": "gpt-4", "temperature": 0.7} + + def test_evaluator_identity_on_dataclass_overrides_introspection(self) -> None: + """evaluator_identity() wins even if the object is a dataclass.""" + + @dataclasses.dataclass + class VersionedEvaluator: + threshold: float + version: int = 1 + + def evaluator_identity(self) -> dict: + return {"v": self.version, "t": self.threshold} + + result = _canonical(VersionedEvaluator(threshold=0.8, version=2)) + assert result == {"v": 2, "t": 0.8} + + def test_evaluator_identity_different_config_different_hash(self) -> None: + class CustomScorer: + def __init__(self, model: str): + self.model = model + + def evaluator_identity(self) -> dict: + return {"model": self.model} + + h1 = compute_eval_hash([CustomScorer(model="gpt-4")]) + h2 = compute_eval_hash([CustomScorer(model="claude")]) + assert h1 != h2 + + def test_evaluator_identity_same_config_same_hash(self) -> None: + class CustomScorer: + def __init__(self, model: str): + self.model = model + + def evaluator_identity(self) -> dict: + return {"model": self.model} + + h1 = compute_eval_hash([CustomScorer(model="gpt-4")]) + h2 = compute_eval_hash([CustomScorer(model="gpt-4")]) + assert h1 == h2 + + +# --------------------------------------------------------------------------- +# _canonical — fail-hard on unknown types +# --------------------------------------------------------------------------- + + +class TestCanonicalFailHard: + def test_unknown_type_raises_canonical_error(self) -> None: + class Opaque: + pass + + with pytest.raises(CanonicalError, match="Opaque"): + _canonical(Opaque()) + + def test_non_callable_non_dataclass_raises(self) -> None: + with pytest.raises(CanonicalError): + _canonical(object()) + + def 
test_error_message_mentions_evaluator_identity(self) -> None: + class Opaque: + pass + + with pytest.raises(CanonicalError, match="evaluator_identity"): + _canonical(Opaque()) + + +# --------------------------------------------------------------------------- +# compute_case_hash +# --------------------------------------------------------------------------- + + +class TestComputeCaseHash: + def test_same_inputs_same_hash(self) -> None: + h1 = compute_case_hash("hello", "expected") + h2 = compute_case_hash("hello", "expected") + assert h1 == h2 + + def test_different_inputs_different_hash(self) -> None: + h1 = compute_case_hash("hello", "expected") + h2 = compute_case_hash("world", "expected") + assert h1 != h2 + + def test_none_expected_is_stable(self) -> None: + h1 = compute_case_hash("hello", None) + h2 = compute_case_hash("hello", None) + assert h1 == h2 + + def test_dict_inputs(self) -> None: + h1 = compute_case_hash({"q": "hello", "context": "world"}, "expected") + h2 = compute_case_hash({"context": "world", "q": "hello"}, "expected") + assert h1 == h2, "dict key order should not affect hash" + + +# --------------------------------------------------------------------------- +# compute_eval_hash +# --------------------------------------------------------------------------- class TestComputeEvalHash: @@ -67,6 +269,21 @@ def test_different_thresholds_produce_different_hashes(self) -> None: def test_evaluator_with_lock_does_not_crash(self) -> None: """Regression for non-picklable evaluator fields.""" ev = LockHoldingEvaluator(name="llm_judge") - # Should not raise TypeError about cannot pickle '_thread.lock' hash_val = compute_eval_hash([ev]) assert len(hash_val) == 12 + + def test_partial_evaluators_hash_stably(self) -> None: + ev = functools.partial(parameterized_function, keywords=["paris"]) + h1 = compute_eval_hash([ev]) + h2 = compute_eval_hash([ev]) + assert h1 == h2 + + def test_bare_function_evaluator(self) -> None: + h1 = compute_eval_hash([bare_function]) 
+ h2 = compute_eval_hash([bare_function]) + assert h1 == h2 + + def test_different_partial_kwargs_different_hash(self) -> None: + ev_a = functools.partial(parameterized_function, keywords=["paris"]) + ev_b = functools.partial(parameterized_function, keywords=["lyon"]) + assert compute_eval_hash([ev_a]) != compute_eval_hash([ev_b]) From d7fbba375821d8cbf8aa4bea10fd6ea795f44dfa Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Tue, 31 Mar 2026 23:50:32 +0200 Subject: [PATCH 14/60] refactor: replace kind string literals with SuiteKind StrEnum Type-safe suite kind across the codebase. StrEnum keeps JSON compat (SuiteKind.EVAL == "eval") so no migration needed. --- protest/core/session.py | 5 +++-- protest/core/suite.py | 5 +++-- protest/entities/__init__.py | 2 ++ protest/entities/core.py | 9 ++++++++- protest/evals/history.py | 3 ++- protest/filters/kind.py | 7 ++++--- protest/history/plugin.py | 3 ++- tests/evals/test_e2e.py | 5 +++-- 8 files changed, 27 insertions(+), 12 deletions(-) diff --git a/protest/core/session.py b/protest/core/session.py index 7ea04f3..910d032 100644 --- a/protest/core/session.py +++ b/protest/core/session.py @@ -22,6 +22,7 @@ FixtureScope, Retry, Skip, + SuiteKind, TestRegistration, Xfail, normalize_retry, @@ -235,7 +236,7 @@ def decorator(func: FuncT) -> FuncT: suite = ProTestSuite( name=suite_name, tags=list(tags or []), - kind="eval", + kind=SuiteKind.EVAL, metadata=suite_meta, ) wrapper = make_eval_wrapper( @@ -372,7 +373,7 @@ def activate_plugins(self, ctx: PluginContext) -> None: self.register_plugin(instance) # Auto-wire eval support if any suite has kind="eval" - if any(s.kind == "eval" for s in self._suites): + if any(s.kind == SuiteKind.EVAL for s in self._suites): self._wire_eval_support() def _wire_eval_support(self) -> None: diff --git a/protest/core/suite.py b/protest/core/suite.py index 262d908..99b4fa2 100644 --- a/protest/core/suite.py +++ b/protest/core/suite.py @@ -14,6 
+14,7 @@ FixtureRegistration, Retry, Skip, + SuiteKind, SuitePath, TestRegistration, Xfail, @@ -49,7 +50,7 @@ def __init__( # noqa: PLR0913 max_concurrency: int | None = None, tags: list[str] | None = None, description: str | None = None, - kind: str = "test", + kind: SuiteKind = SuiteKind.TEST, metadata: dict[str, Any] | None = None, ) -> None: if max_concurrency is not None and max_concurrency < 1: @@ -76,7 +77,7 @@ def description(self) -> str | None: return self._description @property - def kind(self) -> str: + def kind(self) -> SuiteKind: return self._kind @property diff --git a/protest/entities/__init__.py b/protest/entities/__init__.py index 30bd04e..3016ebb 100644 --- a/protest/entities/__init__.py +++ b/protest/entities/__init__.py @@ -4,6 +4,7 @@ FixtureMarker, FixtureRegistration, FixtureScope, + SuiteKind, TestItem, TestOutcome, TestRegistration, @@ -48,6 +49,7 @@ "SessionResult", "SessionSetupInfo", "Skip", + "SuiteKind", "SuitePath", "SuiteResult", "SuiteSetupInfo", diff --git a/protest/entities/core.py b/protest/entities/core.py index f5efa22..5a8c680 100644 --- a/protest/entities/core.py +++ b/protest/entities/core.py @@ -1,7 +1,7 @@ from __future__ import annotations from dataclasses import dataclass, field -from enum import Enum +from enum import Enum, StrEnum from typing import TYPE_CHECKING, Any, TypeAlias if TYPE_CHECKING: @@ -20,6 +20,13 @@ FixtureCallable: TypeAlias = "Callable[..., Any]" +class SuiteKind(StrEnum): + """Kind of suite — determines behavior (eval wiring, history, reporting).""" + + TEST = "test" + EVAL = "eval" + + class FixtureScope(Enum): """Scope level for fixtures.""" diff --git a/protest/evals/history.py b/protest/evals/history.py index b607566..725c5b3 100644 --- a/protest/evals/history.py +++ b/protest/evals/history.py @@ -6,6 +6,7 @@ from datetime import datetime, timezone from typing import TYPE_CHECKING, Any +from protest.entities import SuiteKind from protest.history.collector import collect_env_info, 
collect_git_info from protest.history.storage import DEFAULT_HISTORY_DIR, HISTORY_FILE, append_entry from protest.plugin import PluginBase @@ -52,7 +53,7 @@ def setup(self, session: ProTestSession) -> None: """Collect per-suite metadata from session.""" self._suite_metadata = {} for suite in session.suites: - if suite.kind == "eval": + if suite.kind == SuiteKind.EVAL: self._suite_metadata[suite.name] = suite.suite_metadata def on_eval_suite_end(self, report: EvalSuiteReport) -> None: diff --git a/protest/filters/kind.py b/protest/filters/kind.py index 859e7dd..076684a 100644 --- a/protest/filters/kind.py +++ b/protest/filters/kind.py @@ -4,6 +4,7 @@ from typing import TYPE_CHECKING +from protest.entities import SuiteKind from protest.plugin import PluginBase if TYPE_CHECKING: @@ -17,14 +18,14 @@ class KindFilterPlugin(PluginBase): name = "kind-filter" description = "Filter by suite kind" - def __init__(self, kind: str) -> None: + def __init__(self, kind: SuiteKind) -> None: self._kind = kind @classmethod def activate(cls, ctx: PluginContext) -> KindFilterPlugin | None: kind = ctx.get("kind_filter") if kind: - return cls(kind=kind) + return cls(kind=SuiteKind(kind)) return None def on_collection_finish(self, items: list[TestItem]) -> list[TestItem]: @@ -32,5 +33,5 @@ def on_collection_finish(self, items: list[TestItem]) -> list[TestItem]: def _matches(self, item: TestItem) -> bool: if item.suite is None: - return self._kind == "test" + return self._kind == SuiteKind.TEST return item.suite.kind == self._kind diff --git a/protest/history/plugin.py b/protest/history/plugin.py index e216fe6..c8a0f79 100644 --- a/protest/history/plugin.py +++ b/protest/history/plugin.py @@ -14,6 +14,7 @@ from pathlib import Path from protest.core.session import ProTestSession + from protest.entities import SuiteKind from protest.entities.events import SessionResult, TestResult from protest.plugin import PluginContext @@ -28,7 +29,7 @@ def __init__(self, history_dir: Path | None = None) 
-> None: self._history_dir = history_dir or DEFAULT_HISTORY_DIR self._history_file = self._history_dir / HISTORY_FILE self._suites: dict[str, dict[str, dict[str, Any]]] = {} - self._suite_kinds: dict[str, str] = {} + self._suite_kinds: dict[str, SuiteKind] = {} self._default_suite_name: str = "tests" self._history_enabled: bool = False self._metadata: dict[str, Any] = {} diff --git a/tests/evals/test_e2e.py b/tests/evals/test_e2e.py index 9bdaead..72ef8ff 100644 --- a/tests/evals/test_e2e.py +++ b/tests/evals/test_e2e.py @@ -23,6 +23,7 @@ from protest.core.collector import Collector from protest.core.runner import TestRunner from protest.core.suite import ProTestSuite +from protest.entities import SuiteKind from protest.evals import ( EvalContext, EvalSession, @@ -191,7 +192,7 @@ def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: def test_kind_filter_keeps_only_matching(self) -> None: test_suite = ProTestSuite("tests") - eval_suite = ProTestSuite("evals", kind="eval") + eval_suite = ProTestSuite("evals", kind=SuiteKind.EVAL) session = ProTestSession() @@ -210,7 +211,7 @@ def eval_one() -> None: assert len(items) == 2 # Filter to eval only - plugin = KindFilterPlugin(kind="eval") + plugin = KindFilterPlugin(kind=SuiteKind.EVAL) filtered = plugin.on_collection_finish(items) assert len(filtered) == 1 assert filtered[0].suite.kind == "eval" From 905d3c8308ef38339efd7c522b5a782b1b3d7a52 Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Wed, 1 Apr 2026 00:59:36 +0200 Subject: [PATCH 15/60] refactor: move lazy imports to top-level, remove PLC0415 per-file ignores MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 28 lazy imports in protest/, none resolving a real circular dependency. Moved all to top-level except justified cases (optional deps like rich, conditional wiring, and one true circular import in evals/__init__.py). 
Removed blanket PLC0415 per-file-ignores from pyproject.toml — remaining suppressions use inline noqa with justification. --- protest/api.py | 40 ++++++++++----------------------- protest/cli/history.py | 19 ++++++++-------- protest/cli/main.py | 26 +++++++++------------ protest/core/session.py | 15 +++++++------ protest/evals/__init__.py | 2 +- protest/evals/evaluator.py | 20 +++++++++++------ protest/evals/history.py | 16 +++++++------ protest/evals/results_writer.py | 5 +---- protest/evals/session.py | 3 +-- protest/evals/wrapper.py | 12 +++------- protest/history/collector.py | 2 +- pyproject.toml | 5 ----- 12 files changed, 68 insertions(+), 97 deletions(-) diff --git a/protest/api.py b/protest/api.py index a6c6f79..ce8c178 100644 --- a/protest/api.py +++ b/protest/api.py @@ -14,21 +14,27 @@ def test_example(): assert True success = run_session(session) - -Note: - This module uses lazy imports (PLC0415) to optimize startup time. - Users importing `from protest.api import run_session` shouldn't pay - the cost of loading the entire framework until they actually call it. 
""" from __future__ import annotations +import asyncio from typing import TYPE_CHECKING +from protest.core.collector import Collector +from protest.core.runner import TestRunner +from protest.core.suite import ( + ProTestSuite, # noqa: TC001 — used at runtime in list_tags +) +from protest.events.types import Event +from protest.filters.keyword import KeywordFilterPlugin +from protest.filters.suite import SuiteFilterPlugin +from protest.plugin import PluginBase, PluginContext +from protest.tags.plugin import TagFilterPlugin + if TYPE_CHECKING: from protest.core.session import ProTestSession from protest.entities import RunResult, TestItem - from protest.plugin import PluginContext def run_session( # noqa: PLR0913 - public API with many optional params @@ -69,10 +75,6 @@ def run_session( # noqa: PLR0913 - public API with many optional params Returns: RunResult with success status and interrupted flag. """ - from protest.core.runner import ( # noqa: PLC0415 - lazy import for startup perf - TestRunner, - ) - # Apply session-level settings from ctx or params if ctx is not None: if ctx.get("concurrency") is not None: @@ -91,10 +93,6 @@ def run_session( # noqa: PLR0913 - public API with many optional params # Build context from parameters if not provided if ctx is None: - from protest.plugin import ( # noqa: PLC0415 - lazy import for startup perf - PluginContext, - ) - ctx = PluginContext( args={ "last_failed": last_failed, @@ -136,16 +134,6 @@ def collect_tests( # noqa: PLR0913 - public API with many optional params Returns: List of collected TestItem objects. 
""" - # Lazy imports for startup performance - only load when function is called - import asyncio # noqa: PLC0415 - - from protest.core.collector import Collector # noqa: PLC0415 - from protest.events.types import Event # noqa: PLC0415 - from protest.filters.keyword import KeywordFilterPlugin # noqa: PLC0415 - from protest.filters.suite import SuiteFilterPlugin # noqa: PLC0415 - from protest.plugin import PluginBase, PluginContext # noqa: PLC0415 - from protest.tags.plugin import TagFilterPlugin # noqa: PLC0415 - # Build context from parameters if not provided if ctx is None: ctx = PluginContext( @@ -182,10 +170,6 @@ def list_tags(session: ProTestSession) -> set[str]: Returns: Set of all tag names declared on fixtures, suites, and tests. """ - from protest.core.suite import ( # noqa: PLC0415, TC001 - lazy import for startup perf - ProTestSuite, - ) - all_tags: set[str] = set() for fixture_reg in session.fixtures: diff --git a/protest/cli/history.py b/protest/cli/history.py index 33b230b..cb00787 100644 --- a/protest/cli/history.py +++ b/protest/cli/history.py @@ -4,8 +4,11 @@ import argparse import sys +from pathlib import Path from typing import Any +from protest.history.storage import clean_dirty, load_history + def handle_history_command(argv: list[str]) -> None: """Entry point for `protest history`.""" @@ -42,10 +45,6 @@ def handle_history_command(argv: list[str]) -> None: ) args = parser.parse_args(argv) - from pathlib import Path - - from protest.history.storage import clean_dirty, load_history - history_dir = Path(args.path) if args.path else None if args.clean_dirty: @@ -165,12 +164,12 @@ class _RichOutput(_Output): """Rich output with colors, tables, panels.""" def __init__(self) -> None: - from rich.console import Console + from rich.console import Console # noqa: PLC0415 — optional dep self.console = Console(highlight=False) def stats(self, entries: list[dict[str, Any]]) -> None: - from rich.table import Table + from rich.table import Table # noqa: 
PLC0415 — optional dep suites = _aggregate_suites(entries) if not suites: @@ -230,8 +229,8 @@ def runs(self, entries: list[dict[str, Any]]) -> None: self.console.print() def detail(self, entry: dict[str, Any]) -> None: - from rich.panel import Panel - from rich.text import Text + from rich.panel import Panel # noqa: PLC0415 — optional dep + from rich.text import Text # noqa: PLC0415 — optional dep kind = "EVAL" if entry.get("evals") else "TEST" git = entry.get("git") or {} @@ -284,8 +283,8 @@ def detail(self, entry: dict[str, Any]) -> None: ) def compare(self, current: dict[str, Any], previous: dict[str, Any]) -> None: - from rich.panel import Panel - from rich.text import Text + from rich.panel import Panel # noqa: PLC0415 — optional dep + from rich.text import Text # noqa: PLC0415 — optional dep cm = _get_display_model(current) pm = _get_display_model(previous) diff --git a/protest/cli/main.py b/protest/cli/main.py index 0ee6f2a..648fd26 100644 --- a/protest/cli/main.py +++ b/protest/cli/main.py @@ -4,10 +4,14 @@ import sys from typing import TYPE_CHECKING, Any +from protest.api import collect_tests, list_tags, run_session +from protest.core.session import ProTestSession +from protest.loader import LoadError, load_session, parse_target +from protest.plugin import PluginContext +from protest.reporting.verbosity import Verbosity + if TYPE_CHECKING: - from protest.core.session import ProTestSession from protest.entities import TestItem - from protest.plugin import PluginContext HELP_EPILOG = """ Examples: @@ -56,9 +60,6 @@ def _handle_tags_command() -> None: def _list_tags(target: str, app_dir: str, recursive: bool = False) -> None: """List all tags in a session.""" - from protest.api import collect_tests, list_tags - from protest.loader import LoadError, load_session - try: session = load_session(target, app_dir) except LoadError as exc: @@ -136,7 +137,7 @@ def _handle_live_command() -> None: ) args = parser.parse_args(sys.argv[2:]) - from protest.reporting.web 
import run_live_server + from protest.reporting.web import run_live_server # noqa: PLC0415 — optional dep run_live_server(port=args.port) @@ -234,15 +235,15 @@ def _create_run_parser() -> argparse.ArgumentParser: def _handle_history_command() -> None: """Handle 'protest history' subcommand.""" - from protest.cli.history import handle_history_command + from protest.cli.history import ( # noqa: PLC0415 — heavy module + handle_history_command, + ) handle_history_command(sys.argv[2:]) def _handle_run_command(kind_filter: str | None = None) -> None: """Handle 'protest run' / 'protest eval' with two-phase parsing.""" - from protest.loader import LoadError, load_session, parse_target - argv = sys.argv[2:] # Phase 1: Parse base args to get target @@ -251,8 +252,6 @@ def _handle_run_command(kind_filter: str | None = None) -> None: # If --help without target, show full help with all plugin options if ("--help" in remaining or "-h" in remaining) and not base_args.target: - from protest.core.session import ProTestSession - full_parser = _create_run_parser() for plugin_class in ProTestSession.default_plugin_classes(): plugin_class.add_cli_options(full_parser) @@ -282,9 +281,6 @@ def _handle_run_command(kind_filter: str | None = None) -> None: args = full_parser.parse_args(argv) # Phase 5: Build context - from protest.plugin import PluginContext - from protest.reporting.verbosity import Verbosity - effective_verbosity = Verbosity.QUIET if args.quiet else args.verbosity ctx_args: dict[str, Any] = { **vars(args), @@ -304,8 +300,6 @@ def run_tests( ctx: PluginContext, collect_only: bool = False, ) -> None: - from protest.api import collect_tests, run_session - if collect_only: items = collect_tests(session, ctx=ctx) print(f"Collected {len(items)} test(s):\n") diff --git a/protest/core/session.py b/protest/core/session.py index 910d032..efef4fb 100644 --- a/protest/core/session.py +++ b/protest/core/session.py @@ -8,13 +8,13 @@ from types import TracebackType from protest.compat 
import Self - from protest.core.suite import ProTestSuite from protest.entities import FixtureCallable from protest.evals.types import JudgeInfo, ModelInfo from protest.plugin import PluginBase, PluginContext from protest.cache.plugin import CachePlugin from protest.cache.storage import CacheStorage +from protest.core.suite import ProTestSuite from protest.di.container import FixtureContainer from protest.di.decorators import get_fixture_marker, unwrap_fixture from protest.entities import ( @@ -29,6 +29,9 @@ normalize_skip, normalize_xfail, ) +from protest.evals.history import EvalHistoryPlugin +from protest.evals.results_writer import EvalResultsWriter +from protest.evals.wrapper import make_eval_wrapper from protest.events.bus import EventBus from protest.events.types import Event from protest.exceptions import InvalidMaxConcurrencyError @@ -223,8 +226,6 @@ def eval( async def my_eval(case: Annotated[dict, From(cases)]) -> str: return await run(case["q"]) """ - from protest.core.suite import ProTestSuite - from protest.evals.wrapper import make_eval_wrapper def decorator(func: FuncT) -> FuncT: suite_name = name or func.__name__ @@ -331,7 +332,9 @@ def register_default_plugins(self) -> None: for plugin_class in self.default_plugin_classes(): self.use(plugin_class) if self._history: - from protest.history.plugin import HistoryPlugin + from protest.history.plugin import ( # noqa: PLC0415 — conditional + HistoryPlugin, + ) self.register_plugin(HistoryPlugin(history_dir=self._history_dir)) @@ -378,8 +381,6 @@ def activate_plugins(self, ctx: PluginContext) -> None: def _wire_eval_support(self) -> None: """Wire eval history + results writer plugins (no EvalPlugin).""" - from protest.evals.history import EvalHistoryPlugin - from protest.evals.results_writer import EvalResultsWriter judge_dict = None if self._eval_judge: @@ -451,7 +452,7 @@ async def __aexit__( exc_val: BaseException | None, exc_tb: TracebackType | None, ) -> bool: - import time + import time # noqa: 
PLC0415 — only needed in __aexit__ teardown_start = time.perf_counter() set_session_teardown_capture(True) diff --git a/protest/evals/__init__.py b/protest/evals/__init__.py index 628a275..d90b8f4 100644 --- a/protest/evals/__init__.py +++ b/protest/evals/__init__.py @@ -47,7 +47,7 @@ def __getattr__(name: str) -> object: # and reporters import protest.evals.types — eagerly importing # EvalSession here would create a circular import chain. if name == "EvalSession": - from protest.evals.session import EvalSession + from protest.evals.session import EvalSession # noqa: PLC0415 — circular import return EvalSession msg = f"module {__name__!r} has no attribute {name!r}" diff --git a/protest/evals/evaluator.py b/protest/evals/evaluator.py index 7fdf6c7..5d7a9f8 100644 --- a/protest/evals/evaluator.py +++ b/protest/evals/evaluator.py @@ -36,7 +36,19 @@ async def llm_judge(ctx: EvalContext, model: str = "haiku") -> bool: import functools import inspect from dataclasses import dataclass, field -from typing import TYPE_CHECKING, Any, Generic, TypeVar +from typing import ( + TYPE_CHECKING, + Annotated, + Any, + Generic, + TypeVar, + get_args, + get_origin, + get_type_hints, +) + +from protest.evals.hashing import _canonical +from protest.evals.types import EvalScore if TYPE_CHECKING: from protest.evals.types import Judge @@ -159,8 +171,6 @@ def __init__(self, evaluators: list[Any]) -> None: def evaluator_identity(self) -> dict[str, Any]: """Identity is the ordered list of inner evaluators.""" - from protest.evals.hashing import _canonical - return {"short_circuit": [_canonical(e) for e in self.evaluators]} @@ -186,10 +196,6 @@ def extract_scores_from_result(result: Any, evaluator_name: str) -> list[Any]: Raises: TypeError: If result is not bool or dataclass. 
""" - from typing import Annotated, get_args, get_origin, get_type_hints - - from protest.evals.types import EvalScore - if isinstance(result, bool): return [EvalScore(name=evaluator_name, value=result)] diff --git a/protest/evals/history.py b/protest/evals/history.py index 725c5b3..010ddb8 100644 --- a/protest/evals/history.py +++ b/protest/evals/history.py @@ -8,7 +8,13 @@ from protest.entities import SuiteKind from protest.history.collector import collect_env_info, collect_git_info -from protest.history.storage import DEFAULT_HISTORY_DIR, HISTORY_FILE, append_entry +from protest.history.storage import ( + DEFAULT_HISTORY_DIR, + HISTORY_FILE, + append_entry, + load_history, + load_previous_run, +) from protest.plugin import PluginBase if TYPE_CHECKING: @@ -75,8 +81,6 @@ def on_session_end(self, _result: Any) -> None: def load_entries(self, n: int | None = None) -> list[dict[str, Any]]: """Load entries from history file.""" - from protest.history.storage import load_history - return load_history(history_dir=self._history_dir, n=n, evals_only=True) @@ -156,8 +160,6 @@ def _serialize_case(case: EvalCaseResult) -> dict[str, Any]: return entry -def load_previous_run(history_dir: Any = None) -> dict[str, Any] | None: +def load_previous_eval_run(history_dir: Any = None) -> dict[str, Any] | None: """Load the most recent eval run from history.""" - from protest.history.storage import load_previous_run as _load - - return _load(history_dir=history_dir, evals_only=True) + return load_previous_run(history_dir=history_dir, evals_only=True) diff --git a/protest/evals/results_writer.py b/protest/evals/results_writer.py index 0c670a8..e069bba 100644 --- a/protest/evals/results_writer.py +++ b/protest/evals/results_writer.py @@ -11,11 +11,11 @@ from pathlib import Path from typing import TYPE_CHECKING, Any +from protest.evals.types import EvalCaseResult, EvalScore, EvalSuiteReport from protest.plugin import PluginBase if TYPE_CHECKING: from protest.entities.events import 
TestResult - from protest.evals.types import EvalCaseResult, EvalScore from protest.plugin import PluginContext DEFAULT_RESULTS_DIR = Path(".protest") / "results" @@ -57,7 +57,6 @@ def _write_case_file(self, case_result: EvalCaseResult, suite_name: str) -> None def on_eval_suite_end(self, report: Any) -> None: """Print results dir path for the suite.""" - from protest.evals.types import EvalSuiteReport if not isinstance(report, EvalSuiteReport): return @@ -68,8 +67,6 @@ def on_eval_suite_end(self, report: Any) -> None: def _build_case_result(result: TestResult, passed: bool) -> EvalCaseResult: """Build EvalCaseResult from a TestResult with eval_payload.""" - from protest.evals.types import EvalCaseResult, EvalScore - payload = result.eval_payload assert payload is not None return EvalCaseResult( diff --git a/protest/evals/session.py b/protest/evals/session.py index ddace3d..09f0d5c 100644 --- a/protest/evals/session.py +++ b/protest/evals/session.py @@ -5,6 +5,7 @@ from typing import TYPE_CHECKING, Any from protest.core.session import ProTestSession +from protest.evals.types import JudgeInfo if TYPE_CHECKING: from pathlib import Path @@ -43,8 +44,6 @@ def __init__( self._eval_model = model self._eval_judge_instance: Judge | None = judge if judge is not None: - from protest.evals.types import JudgeInfo - self._eval_judge = JudgeInfo(name=judge.name, provider=judge.provider) else: self._eval_judge = None diff --git a/protest/evals/wrapper.py b/protest/evals/wrapper.py index 82b21ad..9526acd 100644 --- a/protest/evals/wrapper.py +++ b/protest/evals/wrapper.py @@ -18,7 +18,9 @@ ShortCircuit, extract_scores_from_result, ) -from protest.evals.types import EvalScore +from protest.evals.hashing import compute_case_hash, compute_eval_hash +from protest.evals.types import EvalScore, TaskResult +from protest.exceptions import FixtureError def make_eval_wrapper( @@ -44,8 +46,6 @@ async def eval_wrapper(**kwargs: Any) -> EvalPayload: task_duration = time.perf_counter() - start 
# Unwrap TaskResult if returned - from protest.evals.types import TaskResult - task_input_tokens = 0 task_output_tokens = 0 task_cost = 0.0 @@ -72,8 +72,6 @@ async def eval_wrapper(**kwargs: Any) -> EvalPayload: judge=judge, ) - from protest.evals.hashing import compute_case_hash, compute_eval_hash - return EvalPayload( case_name=case_name, passed=all(s.passed for s in scores), @@ -203,8 +201,6 @@ async def run_evaluators( result = await raw if asyncio.iscoroutine(raw) else raw scores.extend(extract_scores_from_result(result, evaluator_name)) except Exception as exc: - from protest.exceptions import FixtureError - raise FixtureError(f"evaluator '{evaluator_name}'", exc) from exc return scores, ctx @@ -222,8 +218,6 @@ async def _run_short_circuit( raw = ev(ctx) result = await raw if asyncio.iscoroutine(raw) else raw except Exception as exc: - from protest.exceptions import FixtureError - raise FixtureError(f"evaluator '{evaluator_name}'", exc) from exc extracted = extract_scores_from_result(result, evaluator_name) scores.extend(extracted) diff --git a/protest/history/collector.py b/protest/history/collector.py index e81eefd..ee8bb1a 100644 --- a/protest/history/collector.py +++ b/protest/history/collector.py @@ -74,7 +74,7 @@ def _git(*args: str) -> str: def _get_pkg_version(name: str) -> str | None: try: - from importlib.metadata import version + from importlib.metadata import version # noqa: PLC0415 — inside try/except return version(name) except Exception: diff --git a/pyproject.toml b/pyproject.toml index 92373eb..c1da068 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -96,28 +96,23 @@ ignore = [ ] "protest/cli/**" = [ "T201", # print allowed in CLI - "PLC0415", # lazy imports for fast --help "PLR2004", # magic values for arg parsing ] "protest/core/session.py" = [ - "PLC0415", # lazy import for optional rich dependency "PLR0913", # many args is deliberate API design ] "protest/core/execution/test_executor.py" = [ "PLR0915", # _run_test is inherently complex 
(retry loop + eval capture) ] "protest/history/**" = [ - "PLC0415", # lazy imports "S603", # subprocess git calls are safe "PLR0913", # load_history has many filter params by design ] "protest/cli/history.py" = [ "T201", # print for CLI output - "PLC0415", # lazy imports ] "protest/evals/**" = [ "T201", # print for eval reporting - "PLC0415", # lazy imports for optional pydantic-evals dependency "PLR0913", # adapter functions have many params by design ] "protest/reporting/ascii.py" = [ From b703fa8f19afa7ab9e004de9c34f872d3b9e8deb Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Wed, 1 Apr 2026 01:29:20 +0200 Subject: [PATCH 16/60] fix: resolve all 32 mypy errors, type EvalContext generics properly - Type built-in evaluators as EvalContext[Any, str] (text evaluators) - not_empty typed EvalContext[Any, Any] (works on any output) - Fix mypy running outside venv (uv run mypy in justfile) - Add mypy config in pyproject.toml with rich stubs override - Fix no-any-return, arg-type, unused type-ignore across codebase - Remove stale type: ignore[import-not-found] on rich imports --- justfile | 2 +- protest/cli/history.py | 4 ++-- protest/console.py | 2 +- protest/core/outcome.py | 6 ++++-- protest/evals/evaluator.py | 2 +- protest/evals/evaluators.py | 20 ++++++++++---------- protest/evals/wrapper.py | 2 +- protest/history/storage.py | 2 +- protest/reporting/rich_reporter.py | 4 ++-- pyproject.toml | 7 +++++++ 10 files changed, 30 insertions(+), 21 deletions(-) diff --git a/justfile b/justfile index ddce526..9ddfe7b 100644 --- a/justfile +++ b/justfile @@ -7,7 +7,7 @@ @lint: ruff format . ruff check --fix . - mypy --strict protest + uv run mypy protest @fullcheck: ruff format --check . && ruff check . 
# lint diff --git a/protest/cli/history.py b/protest/cli/history.py index cb00787..e83216d 100644 --- a/protest/cli/history.py +++ b/protest/cli/history.py @@ -447,8 +447,8 @@ def _track_cases(suite: dict[str, Any], cases: dict[str, Any]) -> None: def _get_display_model(entry: dict[str, Any]) -> str: """Get display model: per-suite models if they differ, global otherwise.""" - suite_models = { - sd.get("model") + suite_models: set[str] = { + sd["model"] for sd in entry.get("suites", {}).values() if isinstance(sd, dict) and sd.get("model") } diff --git a/protest/console.py b/protest/console.py index 29dd381..9959165 100644 --- a/protest/console.py +++ b/protest/console.py @@ -44,7 +44,7 @@ def print(msg: str, *, raw: bool = False) -> None: # Call handlers directly (sync, bypasses async emit). # This ensures messages appear immediately, not after the test. - for handler_entry in bus._handlers.get(Event.USER_PRINT, []): # type: ignore[union-attr] + for handler_entry in bus._handlers.get(Event.USER_PRINT, []): # type: ignore[attr-defined] with contextlib.suppress(Exception): handler_entry.func((msg, raw)) diff --git a/protest/core/outcome.py b/protest/core/outcome.py index 0018812..2563d95 100644 --- a/protest/core/outcome.py +++ b/protest/core/outcome.py @@ -111,8 +111,10 @@ def _build_skip(self, er: TestExecutionResult) -> TestOutcome: def _build_pass(self, er: TestExecutionResult) -> TestOutcome: return TestOutcome( - TestResult(**self._base_kwargs(er)), TestCounts(passed=1), Event.TEST_PASS - ) # type: ignore[arg-type] + TestResult(**self._base_kwargs(er)), # type: ignore[arg-type] + TestCounts(passed=1), + Event.TEST_PASS, + ) def _build_xpass(self, er: TestExecutionResult) -> TestOutcome: kw = self._base_kwargs(er) diff --git a/protest/evals/evaluator.py b/protest/evals/evaluator.py index 5d7a9f8..fac20ed 100644 --- a/protest/evals/evaluator.py +++ b/protest/evals/evaluator.py @@ -260,7 +260,7 @@ def wrapper(*args: Any, **kwargs: Any) -> Any: def 
is_async_evaluator(fn: Any) -> bool: """Check if an evaluator (or partial thereof) is async.""" if hasattr(fn, "_is_async_evaluator"): - return fn._is_async_evaluator + return bool(fn._is_async_evaluator) if isinstance(fn, functools.partial): return asyncio.iscoroutinefunction(fn.func) return asyncio.iscoroutinefunction(fn) diff --git a/protest/evals/evaluators.py b/protest/evals/evaluators.py index d2cd632..ec7d9bd 100644 --- a/protest/evals/evaluators.py +++ b/protest/evals/evaluators.py @@ -10,7 +10,7 @@ import json as json_module import re from dataclasses import dataclass -from typing import Annotated +from typing import Annotated, Any from protest.evals.evaluator import EvalContext, Metric, Verdict, evaluator @@ -45,7 +45,7 @@ class WordOverlapResult: @evaluator def contains_keywords( - ctx: EvalContext, keywords: list[str], min_recall: float = 0.0 + ctx: EvalContext[Any, str], keywords: list[str], min_recall: float = 0.0 ) -> ContainsKeywordsResult: """Check that the output contains expected keywords (case-insensitive).""" output_lower = ctx.output.lower() @@ -59,7 +59,7 @@ def contains_keywords( @evaluator -def contains_expected(ctx: EvalContext, case_sensitive: bool = False) -> bool: +def contains_expected(ctx: EvalContext[Any, str], case_sensitive: bool = False) -> bool: """Check that the output contains expected_output as a substring.""" if ctx.expected_output is None: return True @@ -70,7 +70,7 @@ def contains_expected(ctx: EvalContext, case_sensitive: bool = False) -> bool: @evaluator def does_not_contain( - ctx: EvalContext, forbidden: list[str], case_sensitive: bool = False + ctx: EvalContext[Any, str], forbidden: list[str], case_sensitive: bool = False ) -> DoesNotContainResult: """Check that the output does not contain forbidden words.""" output = ctx.output if case_sensitive else ctx.output.lower() @@ -79,7 +79,7 @@ def does_not_contain( @evaluator -def not_empty(ctx: EvalContext) -> bool: +def not_empty(ctx: EvalContext[Any, Any]) -> bool: 
"""Check that the output is not empty or whitespace-only.""" if ctx.output is None: return False @@ -89,7 +89,7 @@ def not_empty(ctx: EvalContext) -> bool: @evaluator -def max_length(ctx: EvalContext, max_chars: int = 500) -> MaxLengthResult: +def max_length(ctx: EvalContext[Any, str], max_chars: int = 500) -> MaxLengthResult: """Check that the output doesn't exceed a character limit.""" length = len(ctx.output) return MaxLengthResult( @@ -99,20 +99,20 @@ def max_length(ctx: EvalContext, max_chars: int = 500) -> MaxLengthResult: @evaluator -def min_length(ctx: EvalContext, min_chars: int = 1) -> bool: +def min_length(ctx: EvalContext[Any, str], min_chars: int = 1) -> bool: """Check that the output meets a minimum length.""" return len(ctx.output) >= min_chars @evaluator -def matches_regex(ctx: EvalContext, pattern: str, flags: int = 0) -> bool: +def matches_regex(ctx: EvalContext[Any, str], pattern: str, flags: int = 0) -> bool: """Check that the output matches a regex pattern.""" return bool(re.search(pattern, ctx.output, flags)) @evaluator def json_valid( - ctx: EvalContext, required_keys: list[str] | None = None + ctx: EvalContext[Any, str], required_keys: list[str] | None = None ) -> JsonValidResult: """Check that the output is valid JSON, optionally with required keys.""" if required_keys is None: @@ -131,7 +131,7 @@ def json_valid( @evaluator -def word_overlap(ctx: EvalContext) -> WordOverlapResult: +def word_overlap(ctx: EvalContext[Any, str]) -> WordOverlapResult: """Compute word overlap between output and expected_output (tracking-only).""" if ctx.expected_output is None: return WordOverlapResult(overlap=1.0) diff --git a/protest/evals/wrapper.py b/protest/evals/wrapper.py index 9526acd..bc2569b 100644 --- a/protest/evals/wrapper.py +++ b/protest/evals/wrapper.py @@ -134,7 +134,7 @@ def _extract_case_name(kwargs: dict[str, Any], fallback: str) -> str: if _is_case_data(v): name = _get(v, "name") if name: - return name + return str(name) return fallback diff 
--git a/protest/history/storage.py b/protest/history/storage.py index 78d35b9..5dbe047 100644 --- a/protest/history/storage.py +++ b/protest/history/storage.py @@ -90,7 +90,7 @@ def load_previous_run( continue if evals_only and entry.get("evals") is None: continue - return entry + return dict(entry) return None diff --git a/protest/reporting/rich_reporter.py b/protest/reporting/rich_reporter.py index 981b03f..506641d 100644 --- a/protest/reporting/rich_reporter.py +++ b/protest/reporting/rich_reporter.py @@ -5,8 +5,8 @@ from pathlib import Path from typing import Any -from rich.console import Console # type: ignore[import-not-found] -from rich.table import Table # type: ignore[import-not-found] +from rich.console import Console +from rich.table import Table from typing_extensions import Self from protest.entities import ( diff --git a/pyproject.toml b/pyproject.toml index c1da068..ff175b4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -135,6 +135,13 @@ omit = [ "protest/compat.py", # Version-specific imports, impossible to cover without multi-version CI ] +[tool.mypy] +strict = true + +[[tool.mypy.overrides]] +module = "rich.*" +ignore_missing_imports = true + [tool.pytest.ini_options] testpaths = ["tests"] asyncio_mode = "strict" From 39bd555c777e11a7d799daa47c5e3c2be441dcfe Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Wed, 1 Apr 2026 01:34:55 +0200 Subject: [PATCH 17/60] refactor: remove dead duck-typed evaluator markers, add typed examples MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Remove is_async_evaluator(), _is_evaluator, _is_async_evaluator (written but never read — dead code with hasattr duck-typing) - Add yorkshire example evaluators showing EvalContext generics: [Any, str] for text, [str, float] for numeric, [str, bytes] for binary --- examples/yorkshire/evals/evaluators.py | 60 ++++++++++++++++++++++++++ protest/evals/evaluator.py | 13 ------ 2 
files changed, 60 insertions(+), 13 deletions(-) diff --git a/examples/yorkshire/evals/evaluators.py b/examples/yorkshire/evals/evaluators.py index b07153d..1008c22 100644 --- a/examples/yorkshire/evals/evaluators.py +++ b/examples/yorkshire/evals/evaluators.py @@ -2,4 +2,64 @@ Generic evaluators come from protest.evals.evaluators. Only project-specific ones live here. + +These also demonstrate how EvalContext generics document +what an evaluator expects as input/output types. """ + +from __future__ import annotations + +from dataclasses import dataclass +from typing import Annotated, Any + +from protest.evals import EvalContext, Metric, Verdict, evaluator + +# --- Text evaluator: EvalContext[Any, str] --------------------------------- +# Most evaluators work on text output. The first type param (inputs) is Any +# because evaluators don't usually care about the input shape. + + +@dataclass(frozen=True, slots=True) +class MentionsBreedResult: + breed_mentioned: Annotated[bool, Verdict] + + +@evaluator +def mentions_breed( + ctx: EvalContext[Any, str], breed: str = "Yorkshire" +) -> MentionsBreedResult: + """Check that the output mentions a specific breed.""" + return MentionsBreedResult(breed_mentioned=breed.lower() in ctx.output.lower()) + + +# --- Numeric evaluator: EvalContext[str, float] ---------------------------- +# An evaluator for a task that returns a numeric score (e.g. a classifier +# confidence, a similarity metric). The output is a float, not a string. + + +@dataclass(frozen=True, slots=True) +class ConfidenceResult: + confidence: Annotated[float, Metric] + above_threshold: Annotated[bool, Verdict] + + +@evaluator +def confidence_above( + ctx: EvalContext[str, float], threshold: float = 0.8 +) -> ConfidenceResult: + """Check that a numeric output (e.g. 
classifier confidence) meets a threshold.""" + return ConfidenceResult( + confidence=ctx.output, + above_threshold=ctx.output >= threshold, + ) + + +# --- Binary evaluator: EvalContext[str, bytes] ----------------------------- +# An evaluator for a task that returns raw bytes (e.g. image generation, +# audio synthesis). The evaluator checks basic properties of the output. + + +@evaluator +def output_not_empty_bytes(ctx: EvalContext[str, bytes]) -> bool: + """Check that a binary output (e.g. generated image) is not empty.""" + return len(ctx.output) > 0 diff --git a/protest/evals/evaluator.py b/protest/evals/evaluator.py index fac20ed..6d0c980 100644 --- a/protest/evals/evaluator.py +++ b/protest/evals/evaluator.py @@ -31,7 +31,6 @@ async def llm_judge(ctx: EvalContext, model: str = "haiku") -> bool: from __future__ import annotations -import asyncio import dataclasses import functools import inspect @@ -243,7 +242,6 @@ def wrapper(*args: Any, **kwargs: Any) -> Any: if has_extra_params and kwargs: bound = functools.partial(fn, **kwargs) # Preserve async detection on the partial - bound._is_async_evaluator = asyncio.iscoroutinefunction(fn) # type: ignore[attr-defined] bound.__name__ = fn.__name__ # type: ignore[attr-defined] bound.__qualname__ = fn.__qualname__ # type: ignore[attr-defined] return bound @@ -252,15 +250,4 @@ def wrapper(*args: Any, **kwargs: Any) -> Any: return fn return fn(*args, **kwargs) - wrapper._is_evaluator = True # type: ignore[attr-defined] - wrapper._is_async_evaluator = asyncio.iscoroutinefunction(fn) # type: ignore[attr-defined] return wrapper - - -def is_async_evaluator(fn: Any) -> bool: - """Check if an evaluator (or partial thereof) is async.""" - if hasattr(fn, "_is_async_evaluator"): - return bool(fn._is_async_evaluator) - if isinstance(fn, functools.partial): - return asyncio.iscoroutinefunction(fn.func) - return asyncio.iscoroutinefunction(fn) From 155db22114d855db5e4fae61eca797f7699379db Mon Sep 17 00:00:00 2001 From: Renaud Cepre 
<32103211+renaudcepre@users.noreply.github.com> Date: Wed, 1 Apr 2026 01:43:12 +0200 Subject: [PATCH 18/60] ci: update workflow to install dependencies and fix mypy invocation --- .github/workflows/ci.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 75efa11..84a54c7 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -46,8 +46,11 @@ jobs: with: python-version: "3.12" + - name: Install dependencies + run: uv sync --all-extras + - name: Type check - run: uvx mypy --strict protest + run: uv run mypy protest test: needs: lint From 96d3632c03b6c05cf4fd03fd9ad493eed801f538 Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Wed, 1 Apr 2026 01:47:56 +0200 Subject: [PATCH 19/60] refactor: remove redundant type ignores, update dependency management - Removed unnecessary `# type: ignore[import-not-found]` markers on imports. - Added `--group dev` flag to dependency sync in CI workflow. - Updated `uv.lock` to include new packages: `librt` and `mypy`. 
--- .github/workflows/ci.yml | 2 +- protest/reporting/factory.py | 2 +- protest/reporting/web.py | 8 +- pyproject.toml | 5 +- uv.lock | 160 ++++++++++++++++++++++++++++++++++- 5 files changed, 164 insertions(+), 13 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 84a54c7..22a0944 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -47,7 +47,7 @@ jobs: python-version: "3.12" - name: Install dependencies - run: uv sync --all-extras + run: uv sync --all-extras --group dev - name: Type check run: uv run mypy protest diff --git a/protest/reporting/factory.py b/protest/reporting/factory.py index e3d405a..6d0fbf6 100644 --- a/protest/reporting/factory.py +++ b/protest/reporting/factory.py @@ -18,7 +18,7 @@ def get_reporter(force_no_color: bool = False) -> PluginBase: return AsciiReporter() try: - from rich.console import Console # type: ignore[import-not-found] + from rich.console import Console Console() except ImportError: diff --git a/protest/reporting/web.py b/protest/reporting/web.py index 2e47b5d..517de24 100644 --- a/protest/reporting/web.py +++ b/protest/reporting/web.py @@ -30,12 +30,12 @@ ) try: - from websockets.asyncio.server import ( # type: ignore[import-not-found] + from websockets.asyncio.server import ( serve as ws_serve, ) - from websockets.datastructures import Headers # type: ignore[import-not-found] - from websockets.http11 import Request, Response # type: ignore[import-not-found] - from websockets.sync.client import ( # type: ignore[import-not-found] + from websockets.datastructures import Headers + from websockets.http11 import Request, Response + from websockets.sync.client import ( connect as ws_connect, ) except ImportError as err: # pragma: no cover diff --git a/pyproject.toml b/pyproject.toml index ff175b4..0dbe858 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -138,10 +138,6 @@ omit = [ [tool.mypy] strict = true -[[tool.mypy.overrides]] -module = "rich.*" -ignore_missing_imports = 
true - [tool.pytest.ini_options] testpaths = ["tests"] asyncio_mode = "strict" @@ -174,6 +170,7 @@ include = ["protest*"] dev = [ "jsonschema>=4.0.0", "mkdocs-material>=9.7.0", + "mypy>=1.0", "pre-commit>=4.5.0", "pytest>=9.0.1", "pytest-asyncio>=1.3.0", diff --git a/uv.lock b/uv.lock index 34a6ee8..e4d7032 100644 --- a/uv.lock +++ b/uv.lock @@ -477,6 +477,91 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/41/45/1a4ed80516f02155c51f51e8cedb3c1902296743db0bbc66608a0db2814f/jsonschema_specifications-2025.9.1-py3-none-any.whl", hash = "sha256:98802fee3a11ee76ecaca44429fda8a41bff98b00a0f2838151b113f210cc6fe", size = 18437, upload-time = "2025-09-08T01:34:57.871Z" }, ] +[[package]] +name = "librt" +version = "0.8.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/56/9c/b4b0c54d84da4a94b37bd44151e46d5e583c9534c7e02250b961b1b6d8a8/librt-0.8.1.tar.gz", hash = "sha256:be46a14693955b3bd96014ccbdb8339ee8c9346fbe11c1b78901b55125f14c73", size = 177471, upload-time = "2026-02-17T16:13:06.101Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7c/5f/63f5fa395c7a8a93558c0904ba8f1c8d1b997ca6a3de61bc7659970d66bf/librt-0.8.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:81fd938344fecb9373ba1b155968c8a329491d2ce38e7ddb76f30ffb938f12dc", size = 65697, upload-time = "2026-02-17T16:11:06.903Z" }, + { url = "https://files.pythonhosted.org/packages/ff/e0/0472cf37267b5920eff2f292ccfaede1886288ce35b7f3203d8de00abfe6/librt-0.8.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:5db05697c82b3a2ec53f6e72b2ed373132b0c2e05135f0696784e97d7f5d48e7", size = 68376, upload-time = "2026-02-17T16:11:08.395Z" }, + { url = "https://files.pythonhosted.org/packages/c8/be/8bd1359fdcd27ab897cd5963294fa4a7c83b20a8564678e4fd12157e56a5/librt-0.8.1-cp310-cp310-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:d56bc4011975f7460bea7b33e1ff425d2f1adf419935ff6707273c77f8a4ada6", size = 197084, 
upload-time = "2026-02-17T16:11:09.774Z" }, + { url = "https://files.pythonhosted.org/packages/e2/fe/163e33fdd091d0c2b102f8a60cc0a61fd730ad44e32617cd161e7cd67a01/librt-0.8.1-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5cdc0f588ff4b663ea96c26d2a230c525c6fc62b28314edaaaca8ed5af931ad0", size = 207337, upload-time = "2026-02-17T16:11:11.311Z" }, + { url = "https://files.pythonhosted.org/packages/01/99/f85130582f05dcf0c8902f3d629270231d2f4afdfc567f8305a952ac7f14/librt-0.8.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:97c2b54ff6717a7a563b72627990bec60d8029df17df423f0ed37d56a17a176b", size = 219980, upload-time = "2026-02-17T16:11:12.499Z" }, + { url = "https://files.pythonhosted.org/packages/6f/54/cb5e4d03659e043a26c74e08206412ac9a3742f0477d96f9761a55313b5f/librt-0.8.1-cp310-cp310-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:8f1125e6bbf2f1657d9a2f3ccc4a2c9b0c8b176965bb565dd4d86be67eddb4b6", size = 212921, upload-time = "2026-02-17T16:11:14.484Z" }, + { url = "https://files.pythonhosted.org/packages/b1/81/a3a01e4240579c30f3487f6fed01eb4bc8ef0616da5b4ebac27ca19775f3/librt-0.8.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:8f4bb453f408137d7581be309b2fbc6868a80e7ef60c88e689078ee3a296ae71", size = 221381, upload-time = "2026-02-17T16:11:17.459Z" }, + { url = "https://files.pythonhosted.org/packages/08/b0/fc2d54b4b1c6fb81e77288ff31ff25a2c1e62eaef4424a984f228839717b/librt-0.8.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:c336d61d2fe74a3195edc1646d53ff1cddd3a9600b09fa6ab75e5514ba4862a7", size = 216714, upload-time = "2026-02-17T16:11:19.197Z" }, + { url = "https://files.pythonhosted.org/packages/96/96/85daa73ffbd87e1fb287d7af6553ada66bf25a2a6b0de4764344a05469f6/librt-0.8.1-cp310-cp310-musllinux_1_2_riscv64.whl", hash = "sha256:eb5656019db7c4deacf0c1a55a898c5bb8f989be904597fcb5232a2f4828fa05", size = 214777, upload-time = 
"2026-02-17T16:11:20.443Z" }, + { url = "https://files.pythonhosted.org/packages/12/9c/c3aa7a2360383f4bf4f04d98195f2739a579128720c603f4807f006a4225/librt-0.8.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:c25d9e338d5bed46c1632f851babf3d13c78f49a225462017cf5e11e845c5891", size = 237398, upload-time = "2026-02-17T16:11:22.083Z" }, + { url = "https://files.pythonhosted.org/packages/61/19/d350ea89e5274665185dabc4bbb9c3536c3411f862881d316c8b8e00eb66/librt-0.8.1-cp310-cp310-win32.whl", hash = "sha256:aaab0e307e344cb28d800957ef3ec16605146ef0e59e059a60a176d19543d1b7", size = 54285, upload-time = "2026-02-17T16:11:23.27Z" }, + { url = "https://files.pythonhosted.org/packages/4f/d6/45d587d3d41c112e9543a0093d883eb57a24a03e41561c127818aa2a6bcc/librt-0.8.1-cp310-cp310-win_amd64.whl", hash = "sha256:56e04c14b696300d47b3bc5f1d10a00e86ae978886d0cee14e5714fafb5df5d2", size = 61352, upload-time = "2026-02-17T16:11:24.207Z" }, + { url = "https://files.pythonhosted.org/packages/1d/01/0e748af5e4fee180cf7cd12bd12b0513ad23b045dccb2a83191bde82d168/librt-0.8.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:681dc2451d6d846794a828c16c22dc452d924e9f700a485b7ecb887a30aad1fd", size = 65315, upload-time = "2026-02-17T16:11:25.152Z" }, + { url = "https://files.pythonhosted.org/packages/9d/4d/7184806efda571887c798d573ca4134c80ac8642dcdd32f12c31b939c595/librt-0.8.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a3b4350b13cc0e6f5bec8fa7caf29a8fb8cdc051a3bae45cfbfd7ce64f009965", size = 68021, upload-time = "2026-02-17T16:11:26.129Z" }, + { url = "https://files.pythonhosted.org/packages/ae/88/c3c52d2a5d5101f28d3dc89298444626e7874aa904eed498464c2af17627/librt-0.8.1-cp311-cp311-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:ac1e7817fd0ed3d14fd7c5df91daed84c48e4c2a11ee99c0547f9f62fdae13da", size = 194500, upload-time = "2026-02-17T16:11:27.177Z" }, + { url = 
"https://files.pythonhosted.org/packages/d6/5d/6fb0a25b6a8906e85b2c3b87bee1d6ed31510be7605b06772f9374ca5cb3/librt-0.8.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:747328be0c5b7075cde86a0e09d7a9196029800ba75a1689332348e998fb85c0", size = 205622, upload-time = "2026-02-17T16:11:28.242Z" }, + { url = "https://files.pythonhosted.org/packages/b2/a6/8006ae81227105476a45691f5831499e4d936b1c049b0c1feb17c11b02d1/librt-0.8.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f0af2bd2bc204fa27f3d6711d0f360e6b8c684a035206257a81673ab924aa11e", size = 218304, upload-time = "2026-02-17T16:11:29.344Z" }, + { url = "https://files.pythonhosted.org/packages/ee/19/60e07886ad16670aae57ef44dada41912c90906a6fe9f2b9abac21374748/librt-0.8.1-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:d480de377f5b687b6b1bc0c0407426da556e2a757633cc7e4d2e1a057aa688f3", size = 211493, upload-time = "2026-02-17T16:11:30.445Z" }, + { url = "https://files.pythonhosted.org/packages/9c/cf/f666c89d0e861d05600438213feeb818c7514d3315bae3648b1fc145d2b6/librt-0.8.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:d0ee06b5b5291f609ddb37b9750985b27bc567791bc87c76a569b3feed8481ac", size = 219129, upload-time = "2026-02-17T16:11:32.021Z" }, + { url = "https://files.pythonhosted.org/packages/8f/ef/f1bea01e40b4a879364c031476c82a0dc69ce068daad67ab96302fed2d45/librt-0.8.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:9e2c6f77b9ad48ce5603b83b7da9ee3e36b3ab425353f695cba13200c5d96596", size = 213113, upload-time = "2026-02-17T16:11:33.192Z" }, + { url = "https://files.pythonhosted.org/packages/9b/80/cdab544370cc6bc1b72ea369525f547a59e6938ef6863a11ab3cd24759af/librt-0.8.1-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:439352ba9373f11cb8e1933da194dcc6206daf779ff8df0ed69c5e39113e6a99", size = 212269, upload-time = "2026-02-17T16:11:34.373Z" }, + { url = 
"https://files.pythonhosted.org/packages/9d/9c/48d6ed8dac595654f15eceab2035131c136d1ae9a1e3548e777bb6dbb95d/librt-0.8.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:82210adabbc331dbb65d7868b105185464ef13f56f7f76688565ad79f648b0fe", size = 234673, upload-time = "2026-02-17T16:11:36.063Z" }, + { url = "https://files.pythonhosted.org/packages/16/01/35b68b1db517f27a01be4467593292eb5315def8900afad29fabf56304ba/librt-0.8.1-cp311-cp311-win32.whl", hash = "sha256:52c224e14614b750c0a6d97368e16804a98c684657c7518752c356834fff83bb", size = 54597, upload-time = "2026-02-17T16:11:37.544Z" }, + { url = "https://files.pythonhosted.org/packages/71/02/796fe8f02822235966693f257bf2c79f40e11337337a657a8cfebba5febc/librt-0.8.1-cp311-cp311-win_amd64.whl", hash = "sha256:c00e5c884f528c9932d278d5c9cbbea38a6b81eb62c02e06ae53751a83a4d52b", size = 61733, upload-time = "2026-02-17T16:11:38.691Z" }, + { url = "https://files.pythonhosted.org/packages/28/ad/232e13d61f879a42a4e7117d65e4984bb28371a34bb6fb9ca54ec2c8f54e/librt-0.8.1-cp311-cp311-win_arm64.whl", hash = "sha256:f7cdf7f26c2286ffb02e46d7bac56c94655540b26347673bea15fa52a6af17e9", size = 52273, upload-time = "2026-02-17T16:11:40.308Z" }, + { url = "https://files.pythonhosted.org/packages/95/21/d39b0a87ac52fc98f621fb6f8060efb017a767ebbbac2f99fbcbc9ddc0d7/librt-0.8.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:a28f2612ab566b17f3698b0da021ff9960610301607c9a5e8eaca62f5e1c350a", size = 66516, upload-time = "2026-02-17T16:11:41.604Z" }, + { url = "https://files.pythonhosted.org/packages/69/f1/46375e71441c43e8ae335905e069f1c54febee63a146278bcee8782c84fd/librt-0.8.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:60a78b694c9aee2a0f1aaeaa7d101cf713e92e8423a941d2897f4fa37908dab9", size = 68634, upload-time = "2026-02-17T16:11:43.268Z" }, + { url = 
"https://files.pythonhosted.org/packages/0a/33/c510de7f93bf1fa19e13423a606d8189a02624a800710f6e6a0a0f0784b3/librt-0.8.1-cp312-cp312-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:758509ea3f1eba2a57558e7e98f4659d0ea7670bff49673b0dde18a3c7e6c0eb", size = 198941, upload-time = "2026-02-17T16:11:44.28Z" }, + { url = "https://files.pythonhosted.org/packages/dd/36/e725903416409a533d92398e88ce665476f275081d0d7d42f9c4951999e5/librt-0.8.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:039b9f2c506bd0ab0f8725aa5ba339c6f0cd19d3b514b50d134789809c24285d", size = 209991, upload-time = "2026-02-17T16:11:45.462Z" }, + { url = "https://files.pythonhosted.org/packages/30/7a/8d908a152e1875c9f8eac96c97a480df425e657cdb47854b9efaa4998889/librt-0.8.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5bb54f1205a3a6ab41a6fd71dfcdcbd278670d3a90ca502a30d9da583105b6f7", size = 224476, upload-time = "2026-02-17T16:11:46.542Z" }, + { url = "https://files.pythonhosted.org/packages/a8/b8/a22c34f2c485b8903a06f3fe3315341fe6876ef3599792344669db98fcff/librt-0.8.1-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:05bd41cdee35b0c59c259f870f6da532a2c5ca57db95b5f23689fcb5c9e42440", size = 217518, upload-time = "2026-02-17T16:11:47.746Z" }, + { url = "https://files.pythonhosted.org/packages/79/6f/5c6fea00357e4f82ba44f81dbfb027921f1ab10e320d4a64e1c408d035d9/librt-0.8.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:adfab487facf03f0d0857b8710cf82d0704a309d8ffc33b03d9302b4c64e91a9", size = 225116, upload-time = "2026-02-17T16:11:49.298Z" }, + { url = "https://files.pythonhosted.org/packages/f2/a0/95ced4e7b1267fe1e2720a111685bcddf0e781f7e9e0ce59d751c44dcfe5/librt-0.8.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:153188fe98a72f206042be10a2c6026139852805215ed9539186312d50a8e972", size = 217751, upload-time = "2026-02-17T16:11:50.49Z" }, + 
{ url = "https://files.pythonhosted.org/packages/93/c2/0517281cb4d4101c27ab59472924e67f55e375bc46bedae94ac6dc6e1902/librt-0.8.1-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:dd3c41254ee98604b08bd5b3af5bf0a89740d4ee0711de95b65166bf44091921", size = 218378, upload-time = "2026-02-17T16:11:51.783Z" }, + { url = "https://files.pythonhosted.org/packages/43/e8/37b3ac108e8976888e559a7b227d0ceac03c384cfd3e7a1c2ee248dbae79/librt-0.8.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:e0d138c7ae532908cbb342162b2611dbd4d90c941cd25ab82084aaf71d2c0bd0", size = 241199, upload-time = "2026-02-17T16:11:53.561Z" }, + { url = "https://files.pythonhosted.org/packages/4b/5b/35812d041c53967fedf551a39399271bbe4257e681236a2cf1a69c8e7fa1/librt-0.8.1-cp312-cp312-win32.whl", hash = "sha256:43353b943613c5d9c49a25aaffdba46f888ec354e71e3529a00cca3f04d66a7a", size = 54917, upload-time = "2026-02-17T16:11:54.758Z" }, + { url = "https://files.pythonhosted.org/packages/de/d1/fa5d5331b862b9775aaf2a100f5ef86854e5d4407f71bddf102f4421e034/librt-0.8.1-cp312-cp312-win_amd64.whl", hash = "sha256:ff8baf1f8d3f4b6b7257fcb75a501f2a5499d0dda57645baa09d4d0d34b19444", size = 62017, upload-time = "2026-02-17T16:11:55.748Z" }, + { url = "https://files.pythonhosted.org/packages/c7/7c/c614252f9acda59b01a66e2ddfd243ed1c7e1deab0293332dfbccf862808/librt-0.8.1-cp312-cp312-win_arm64.whl", hash = "sha256:0f2ae3725904f7377e11cc37722d5d401e8b3d5851fb9273d7f4fe04f6b3d37d", size = 52441, upload-time = "2026-02-17T16:11:56.801Z" }, + { url = "https://files.pythonhosted.org/packages/c5/3c/f614c8e4eaac7cbf2bbdf9528790b21d89e277ee20d57dc6e559c626105f/librt-0.8.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:7e6bad1cd94f6764e1e21950542f818a09316645337fd5ab9a7acc45d99a8f35", size = 66529, upload-time = "2026-02-17T16:11:57.809Z" }, + { url = "https://files.pythonhosted.org/packages/ab/96/5836544a45100ae411eda07d29e3d99448e5258b6e9c8059deb92945f5c2/librt-0.8.1-cp313-cp313-macosx_11_0_arm64.whl", hash = 
"sha256:cf450f498c30af55551ba4f66b9123b7185362ec8b625a773b3d39aa1a717583", size = 68669, upload-time = "2026-02-17T16:11:58.843Z" }, + { url = "https://files.pythonhosted.org/packages/06/53/f0b992b57af6d5531bf4677d75c44f095f2366a1741fb695ee462ae04b05/librt-0.8.1-cp313-cp313-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:eca45e982fa074090057132e30585a7e8674e9e885d402eae85633e9f449ce6c", size = 199279, upload-time = "2026-02-17T16:11:59.862Z" }, + { url = "https://files.pythonhosted.org/packages/f3/ad/4848cc16e268d14280d8168aee4f31cea92bbd2b79ce33d3e166f2b4e4fc/librt-0.8.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0c3811485fccfda840861905b8c70bba5ec094e02825598bb9d4ca3936857a04", size = 210288, upload-time = "2026-02-17T16:12:00.954Z" }, + { url = "https://files.pythonhosted.org/packages/52/05/27fdc2e95de26273d83b96742d8d3b7345f2ea2bdbd2405cc504644f2096/librt-0.8.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5e4af413908f77294605e28cfd98063f54b2c790561383971d2f52d113d9c363", size = 224809, upload-time = "2026-02-17T16:12:02.108Z" }, + { url = "https://files.pythonhosted.org/packages/7a/d0/78200a45ba3240cb042bc597d6f2accba9193a2c57d0356268cbbe2d0925/librt-0.8.1-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:5212a5bd7fae98dae95710032902edcd2ec4dc994e883294f75c857b83f9aba0", size = 218075, upload-time = "2026-02-17T16:12:03.631Z" }, + { url = "https://files.pythonhosted.org/packages/af/72/a210839fa74c90474897124c064ffca07f8d4b347b6574d309686aae7ca6/librt-0.8.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:e692aa2d1d604e6ca12d35e51fdc36f4cda6345e28e36374579f7ef3611b3012", size = 225486, upload-time = "2026-02-17T16:12:04.725Z" }, + { url = 
"https://files.pythonhosted.org/packages/a3/c1/a03cc63722339ddbf087485f253493e2b013039f5b707e8e6016141130fa/librt-0.8.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:4be2a5c926b9770c9e08e717f05737a269b9d0ebc5d2f0060f0fe3fe9ce47acb", size = 218219, upload-time = "2026-02-17T16:12:05.828Z" }, + { url = "https://files.pythonhosted.org/packages/58/f5/fff6108af0acf941c6f274a946aea0e484bd10cd2dc37610287ce49388c5/librt-0.8.1-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:fd1a720332ea335ceb544cf0a03f81df92abd4bb887679fd1e460976b0e6214b", size = 218750, upload-time = "2026-02-17T16:12:07.09Z" }, + { url = "https://files.pythonhosted.org/packages/71/67/5a387bfef30ec1e4b4f30562c8586566faf87e47d696768c19feb49e3646/librt-0.8.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:93c2af9e01e0ef80d95ae3c720be101227edae5f2fe7e3dc63d8857fadfc5a1d", size = 241624, upload-time = "2026-02-17T16:12:08.43Z" }, + { url = "https://files.pythonhosted.org/packages/d4/be/24f8502db11d405232ac1162eb98069ca49c3306c1d75c6ccc61d9af8789/librt-0.8.1-cp313-cp313-win32.whl", hash = "sha256:086a32dbb71336627e78cc1d6ee305a68d038ef7d4c39aaff41ae8c9aa46e91a", size = 54969, upload-time = "2026-02-17T16:12:09.633Z" }, + { url = "https://files.pythonhosted.org/packages/5c/73/c9fdf6cb2a529c1a092ce769a12d88c8cca991194dfe641b6af12fa964d2/librt-0.8.1-cp313-cp313-win_amd64.whl", hash = "sha256:e11769a1dbda4da7b00a76cfffa67aa47cfa66921d2724539eee4b9ede780b79", size = 62000, upload-time = "2026-02-17T16:12:10.632Z" }, + { url = "https://files.pythonhosted.org/packages/d3/97/68f80ca3ac4924f250cdfa6e20142a803e5e50fca96ef5148c52ee8c10ea/librt-0.8.1-cp313-cp313-win_arm64.whl", hash = "sha256:924817ab3141aca17893386ee13261f1d100d1ef410d70afe4389f2359fea4f0", size = 52495, upload-time = "2026-02-17T16:12:11.633Z" }, + { url = "https://files.pythonhosted.org/packages/c9/6a/907ef6800f7bca71b525a05f1839b21f708c09043b1c6aa77b6b827b3996/librt-0.8.1-cp314-cp314-macosx_10_13_x86_64.whl", hash = 
"sha256:6cfa7fe54fd4d1f47130017351a959fe5804bda7a0bc7e07a2cdbc3fdd28d34f", size = 66081, upload-time = "2026-02-17T16:12:12.766Z" }, + { url = "https://files.pythonhosted.org/packages/1b/18/25e991cd5640c9fb0f8d91b18797b29066b792f17bf8493da183bf5caabe/librt-0.8.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:228c2409c079f8c11fb2e5d7b277077f694cb93443eb760e00b3b83cb8b3176c", size = 68309, upload-time = "2026-02-17T16:12:13.756Z" }, + { url = "https://files.pythonhosted.org/packages/a4/36/46820d03f058cfb5a9de5940640ba03165ed8aded69e0733c417bb04df34/librt-0.8.1-cp314-cp314-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:7aae78ab5e3206181780e56912d1b9bb9f90a7249ce12f0e8bf531d0462dd0fc", size = 196804, upload-time = "2026-02-17T16:12:14.818Z" }, + { url = "https://files.pythonhosted.org/packages/59/18/5dd0d3b87b8ff9c061849fbdb347758d1f724b9a82241aa908e0ec54ccd0/librt-0.8.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:172d57ec04346b047ca6af181e1ea4858086c80bdf455f61994c4aa6fc3f866c", size = 206907, upload-time = "2026-02-17T16:12:16.513Z" }, + { url = "https://files.pythonhosted.org/packages/d1/96/ef04902aad1424fd7299b62d1890e803e6ab4018c3044dca5922319c4b97/librt-0.8.1-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6b1977c4ea97ce5eb7755a78fae68d87e4102e4aaf54985e8b56806849cc06a3", size = 221217, upload-time = "2026-02-17T16:12:17.906Z" }, + { url = "https://files.pythonhosted.org/packages/6d/ff/7e01f2dda84a8f5d280637a2e5827210a8acca9a567a54507ef1c75b342d/librt-0.8.1-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:10c42e1f6fd06733ef65ae7bebce2872bcafd8d6e6b0a08fe0a05a23b044fb14", size = 214622, upload-time = "2026-02-17T16:12:19.108Z" }, + { url = "https://files.pythonhosted.org/packages/1e/8c/5b093d08a13946034fed57619742f790faf77058558b14ca36a6e331161e/librt-0.8.1-cp314-cp314-musllinux_1_2_aarch64.whl", 
hash = "sha256:4c8dfa264b9193c4ee19113c985c95f876fae5e51f731494fc4e0cf594990ba7", size = 221987, upload-time = "2026-02-17T16:12:20.331Z" }, + { url = "https://files.pythonhosted.org/packages/d3/cc/86b0b3b151d40920ad45a94ce0171dec1aebba8a9d72bb3fa00c73ab25dd/librt-0.8.1-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:01170b6729a438f0dedc4a26ed342e3dc4f02d1000b4b19f980e1877f0c297e6", size = 215132, upload-time = "2026-02-17T16:12:21.54Z" }, + { url = "https://files.pythonhosted.org/packages/fc/be/8588164a46edf1e69858d952654e216a9a91174688eeefb9efbb38a9c799/librt-0.8.1-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:7b02679a0d783bdae30d443025b94465d8c3dc512f32f5b5031f93f57ac32071", size = 215195, upload-time = "2026-02-17T16:12:23.073Z" }, + { url = "https://files.pythonhosted.org/packages/f5/f2/0b9279bea735c734d69344ecfe056c1ba211694a72df10f568745c899c76/librt-0.8.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:190b109bb69592a3401fe1ffdea41a2e73370ace2ffdc4a0e8e2b39cdea81b78", size = 237946, upload-time = "2026-02-17T16:12:24.275Z" }, + { url = "https://files.pythonhosted.org/packages/e9/cc/5f2a34fbc8aeb35314a3641f9956fa9051a947424652fad9882be7a97949/librt-0.8.1-cp314-cp314-win32.whl", hash = "sha256:e70a57ecf89a0f64c24e37f38d3fe217a58169d2fe6ed6d70554964042474023", size = 50689, upload-time = "2026-02-17T16:12:25.766Z" }, + { url = "https://files.pythonhosted.org/packages/a0/76/cd4d010ab2147339ca2b93e959c3686e964edc6de66ddacc935c325883d7/librt-0.8.1-cp314-cp314-win_amd64.whl", hash = "sha256:7e2f3edca35664499fbb36e4770650c4bd4a08abc1f4458eab9df4ec56389730", size = 57875, upload-time = "2026-02-17T16:12:27.465Z" }, + { url = "https://files.pythonhosted.org/packages/84/0f/2143cb3c3ca48bd3379dcd11817163ca50781927c4537345d608b5045998/librt-0.8.1-cp314-cp314-win_arm64.whl", hash = "sha256:0d2f82168e55ddefd27c01c654ce52379c0750ddc31ee86b4b266bcf4d65f2a3", size = 48058, upload-time = "2026-02-17T16:12:28.556Z" }, + { url = 
"https://files.pythonhosted.org/packages/d2/0e/9b23a87e37baf00311c3efe6b48d6b6c168c29902dfc3f04c338372fd7db/librt-0.8.1-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:2c74a2da57a094bd48d03fa5d196da83d2815678385d2978657499063709abe1", size = 68313, upload-time = "2026-02-17T16:12:29.659Z" }, + { url = "https://files.pythonhosted.org/packages/db/9a/859c41e5a4f1c84200a7d2b92f586aa27133c8243b6cac9926f6e54d01b9/librt-0.8.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:a355d99c4c0d8e5b770313b8b247411ed40949ca44e33e46a4789b9293a907ee", size = 70994, upload-time = "2026-02-17T16:12:31.516Z" }, + { url = "https://files.pythonhosted.org/packages/4c/28/10605366ee599ed34223ac2bf66404c6fb59399f47108215d16d5ad751a8/librt-0.8.1-cp314-cp314t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:2eb345e8b33fb748227409c9f1233d4df354d6e54091f0e8fc53acdb2ffedeb7", size = 220770, upload-time = "2026-02-17T16:12:33.294Z" }, + { url = "https://files.pythonhosted.org/packages/af/8d/16ed8fd452dafae9c48d17a6bc1ee3e818fd40ef718d149a8eff2c9f4ea2/librt-0.8.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9be2f15e53ce4e83cc08adc29b26fb5978db62ef2a366fbdf716c8a6c8901040", size = 235409, upload-time = "2026-02-17T16:12:35.443Z" }, + { url = "https://files.pythonhosted.org/packages/89/1b/7bdf3e49349c134b25db816e4a3db6b94a47ac69d7d46b1e682c2c4949be/librt-0.8.1-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:785ae29c1f5c6e7c2cde2c7c0e148147f4503da3abc5d44d482068da5322fd9e", size = 246473, upload-time = "2026-02-17T16:12:36.656Z" }, + { url = "https://files.pythonhosted.org/packages/4e/8a/91fab8e4fd2a24930a17188c7af5380eb27b203d72101c9cc000dbdfd95a/librt-0.8.1-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:1d3a7da44baf692f0c6aeb5b2a09c5e6fc7a703bca9ffa337ddd2e2da53f7732", size = 238866, upload-time = "2026-02-17T16:12:37.849Z" }, 
+ { url = "https://files.pythonhosted.org/packages/b9/e0/c45a098843fc7c07e18a7f8a24ca8496aecbf7bdcd54980c6ca1aaa79a8e/librt-0.8.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:5fc48998000cbc39ec0d5311312dda93ecf92b39aaf184c5e817d5d440b29624", size = 250248, upload-time = "2026-02-17T16:12:39.445Z" }, + { url = "https://files.pythonhosted.org/packages/82/30/07627de23036640c952cce0c1fe78972e77d7d2f8fd54fa5ef4554ff4a56/librt-0.8.1-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:e96baa6820280077a78244b2e06e416480ed859bbd8e5d641cf5742919d8beb4", size = 240629, upload-time = "2026-02-17T16:12:40.889Z" }, + { url = "https://files.pythonhosted.org/packages/fb/c1/55bfe1ee3542eba055616f9098eaf6eddb966efb0ca0f44eaa4aba327307/librt-0.8.1-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:31362dbfe297b23590530007062c32c6f6176f6099646bb2c95ab1b00a57c382", size = 239615, upload-time = "2026-02-17T16:12:42.446Z" }, + { url = "https://files.pythonhosted.org/packages/2b/39/191d3d28abc26c9099b19852e6c99f7f6d400b82fa5a4e80291bd3803e19/librt-0.8.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:cc3656283d11540ab0ea01978378e73e10002145117055e03722417aeab30994", size = 263001, upload-time = "2026-02-17T16:12:43.627Z" }, + { url = "https://files.pythonhosted.org/packages/b9/eb/7697f60fbe7042ab4e88f4ee6af496b7f222fffb0a4e3593ef1f29f81652/librt-0.8.1-cp314-cp314t-win32.whl", hash = "sha256:738f08021b3142c2918c03692608baed43bc51144c29e35807682f8070ee2a3a", size = 51328, upload-time = "2026-02-17T16:12:45.148Z" }, + { url = "https://files.pythonhosted.org/packages/7c/72/34bf2eb7a15414a23e5e70ecb9440c1d3179f393d9349338a91e2781c0fb/librt-0.8.1-cp314-cp314t-win_amd64.whl", hash = "sha256:89815a22daf9c51884fb5dbe4f1ef65ee6a146e0b6a8df05f753e2e4a9359bf4", size = 58722, upload-time = "2026-02-17T16:12:46.85Z" }, + { url = "https://files.pythonhosted.org/packages/b2/c8/d148e041732d631fc76036f8b30fae4e77b027a1e95b7a84bb522481a940/librt-0.8.1-cp314-cp314t-win_arm64.whl", 
hash = "sha256:bf512a71a23504ed08103a13c941f763db13fb11177beb3d9244c98c29fb4a61", size = 48755, upload-time = "2026-02-17T16:12:47.943Z" }, +] + [[package]] name = "logfire-api" version = "4.31.0" @@ -679,6 +764,73 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/5b/54/662a4743aa81d9582ee9339d4ffa3c8fd40a4965e033d77b9da9774d3960/mkdocs_material_extensions-1.3.1-py3-none-any.whl", hash = "sha256:adff8b62700b25cb77b53358dad940f3ef973dd6db797907c49e3c2ef3ab4e31", size = 8728, upload-time = "2023-11-22T19:09:43.465Z" }, ] +[[package]] +name = "mypy" +version = "1.20.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "librt", marker = "platform_python_implementation != 'PyPy'" }, + { name = "mypy-extensions" }, + { name = "pathspec" }, + { name = "tomli", marker = "python_full_version < '3.11'" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/f8/5c/b0089fe7fef0a994ae5ee07029ced0526082c6cfaaa4c10d40a10e33b097/mypy-1.20.0.tar.gz", hash = "sha256:eb96c84efcc33f0b5e0e04beacf00129dd963b67226b01c00b9dfc8affb464c3", size = 3815028, upload-time = "2026-03-31T16:55:14.959Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4d/a2/a965c8c3fcd4fa8b84ba0d46606181b0d0a1d50f274c67877f3e9ed4882c/mypy-1.20.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d99f515f95fd03a90875fdb2cca12ff074aa04490db4d190905851bdf8a549a8", size = 14430138, upload-time = "2026-03-31T16:52:37.843Z" }, + { url = "https://files.pythonhosted.org/packages/53/6e/043477501deeb8eabbab7f1a2f6cac62cfb631806dc1d6862a04a7f5011b/mypy-1.20.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:bd0212976dc57a5bfeede7c219e7cd66568a32c05c9129686dd487c059c1b88a", size = 13311282, upload-time = "2026-03-31T16:55:11.021Z" }, + { url = 
"https://files.pythonhosted.org/packages/65/aa/bd89b247b83128197a214f29f0632ff3c14f54d4cd70d144d157bd7d7d6e/mypy-1.20.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f8426d4d75d68714abc17a4292d922f6ba2cfb984b72c2278c437f6dae797865", size = 13750889, upload-time = "2026-03-31T16:52:02.909Z" }, + { url = "https://files.pythonhosted.org/packages/fa/9d/2860be7355c45247ccc0be1501c91176318964c2a137bd4743f58ce6200e/mypy-1.20.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:02cca0761c75b42a20a2757ae58713276605eb29a08dd8a6e092aa347c4115ca", size = 14619788, upload-time = "2026-03-31T16:50:48.928Z" }, + { url = "https://files.pythonhosted.org/packages/75/7f/3ef3e360c91f3de120f205c8ce405e9caf9fc52ef14b65d37073e322c114/mypy-1.20.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:b3a49064504be59e59da664c5e149edc1f26c67c4f8e8456f6ba6aba55033018", size = 14918849, upload-time = "2026-03-31T16:51:10.478Z" }, + { url = "https://files.pythonhosted.org/packages/ae/72/af970dfe167ef788df7c5e6109d2ed0229f164432ce828bc9741a4250e64/mypy-1.20.0-cp310-cp310-win_amd64.whl", hash = "sha256:ebea00201737ad4391142808ed16e875add5c17f676e0912b387739f84991e13", size = 10822007, upload-time = "2026-03-31T16:50:25.268Z" }, + { url = "https://files.pythonhosted.org/packages/93/94/ba9065c2ebe5421619aff684b793d953e438a8bfe31a320dd6d1e0706e81/mypy-1.20.0-cp310-cp310-win_arm64.whl", hash = "sha256:e80cf77847d0d3e6e3111b7b25db32a7f8762fd4b9a3a72ce53fe16a2863b281", size = 9756158, upload-time = "2026-03-31T16:48:36.213Z" }, + { url = "https://files.pythonhosted.org/packages/6e/1c/74cb1d9993236910286865679d1c616b136b2eae468493aa939431eda410/mypy-1.20.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4525e7010b1b38334516181c5b81e16180b8e149e6684cee5a727c78186b4e3b", size = 14343972, upload-time = "2026-03-31T16:49:04.887Z" }, + { url = 
"https://files.pythonhosted.org/packages/d5/0d/01399515eca280386e308cf57901e68d3a52af18691941b773b3380c1df8/mypy-1.20.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a17c5d0bdcca61ce24a35beb828a2d0d323d3fcf387d7512206888c900193367", size = 13225007, upload-time = "2026-03-31T16:50:08.151Z" }, + { url = "https://files.pythonhosted.org/packages/56/ac/b4ba5094fb2d7fe9d2037cd8d18bbe02bcf68fd22ab9ff013f55e57ba095/mypy-1.20.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f75ff57defcd0f1d6e006d721ccdec6c88d4f6a7816eb92f1c4890d979d9ee62", size = 13663752, upload-time = "2026-03-31T16:49:26.064Z" }, + { url = "https://files.pythonhosted.org/packages/db/a7/460678d3cf7da252d2288dad0c602294b6ec22a91932ec368cc11e44bb6e/mypy-1.20.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b503ab55a836136b619b5fc21c8803d810c5b87551af8600b72eecafb0059cb0", size = 14532265, upload-time = "2026-03-31T16:53:55.077Z" }, + { url = "https://files.pythonhosted.org/packages/a3/3e/051cca8166cf0438ae3ea80e0e7c030d7a8ab98dffc93f80a1aa3f23c1a2/mypy-1.20.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:1973868d2adbb4584a3835780b27436f06d1dc606af5be09f187aaa25be1070f", size = 14768476, upload-time = "2026-03-31T16:50:34.587Z" }, + { url = "https://files.pythonhosted.org/packages/be/66/8e02ec184f852ed5c4abb805583305db475930854e09964b55e107cdcbc4/mypy-1.20.0-cp311-cp311-win_amd64.whl", hash = "sha256:2fcedb16d456106e545b2bfd7ef9d24e70b38ec252d2a629823a4d07ebcdb69e", size = 10818226, upload-time = "2026-03-31T16:53:15.624Z" }, + { url = "https://files.pythonhosted.org/packages/13/4b/383ad1924b28f41e4879a74151e7a5451123330d45652da359f9183bcd45/mypy-1.20.0-cp311-cp311-win_arm64.whl", hash = "sha256:379edf079ce44ac8d2805bcf9b3dd7340d4f97aad3a5e0ebabbf9d125b84b442", size = 9750091, upload-time = "2026-03-31T16:54:12.162Z" }, + { url = 
"https://files.pythonhosted.org/packages/be/dd/3afa29b58c2e57c79116ed55d700721c3c3b15955e2b6251dd165d377c0e/mypy-1.20.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:002b613ae19f4ac7d18b7e168ffe1cb9013b37c57f7411984abbd3b817b0a214", size = 14509525, upload-time = "2026-03-31T16:55:01.824Z" }, + { url = "https://files.pythonhosted.org/packages/54/eb/227b516ab8cad9f2a13c5e7a98d28cd6aa75e9c83e82776ae6c1c4c046c7/mypy-1.20.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a9336b5e6712f4adaf5afc3203a99a40b379049104349d747eb3e5a3aa23ac2e", size = 13326469, upload-time = "2026-03-31T16:51:41.23Z" }, + { url = "https://files.pythonhosted.org/packages/57/d4/1ddb799860c1b5ac6117ec307b965f65deeb47044395ff01ab793248a591/mypy-1.20.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f13b3e41bce9d257eded794c0f12878af3129d80aacd8a3ee0dee51f3a978651", size = 13705953, upload-time = "2026-03-31T16:48:55.69Z" }, + { url = "https://files.pythonhosted.org/packages/c5/b7/54a720f565a87b893182a2a393370289ae7149e4715859e10e1c05e49154/mypy-1.20.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9804c3ad27f78e54e58b32e7cb532d128b43dbfb9f3f9f06262b821a0f6bd3f5", size = 14710363, upload-time = "2026-03-31T16:53:26.948Z" }, + { url = "https://files.pythonhosted.org/packages/b2/2a/74810274848d061f8a8ea4ac23aaad43bd3d8c1882457999c2e568341c57/mypy-1.20.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:697f102c5c1d526bdd761a69f17c6070f9892eebcb94b1a5963d679288c09e78", size = 14947005, upload-time = "2026-03-31T16:50:17.591Z" }, + { url = "https://files.pythonhosted.org/packages/77/91/21b8ba75f958bcda75690951ce6fa6b7138b03471618959529d74b8544e2/mypy-1.20.0-cp312-cp312-win_amd64.whl", hash = "sha256:0ecd63f75fdd30327e4ad8b5704bd6d91fc6c1b2e029f8ee14705e1207212489", size = 10880616, upload-time = "2026-03-31T16:52:19.986Z" }, + { url = 
"https://files.pythonhosted.org/packages/8a/15/3d8198ef97c1ca03aea010cce4f1d4f3bc5d9849e8c0140111ca2ead9fdd/mypy-1.20.0-cp312-cp312-win_arm64.whl", hash = "sha256:f194db59657c58593a3c47c6dfd7bad4ef4ac12dbc94d01b3a95521f78177e33", size = 9813091, upload-time = "2026-03-31T16:53:44.385Z" }, + { url = "https://files.pythonhosted.org/packages/d6/a7/f64ea7bd592fa431cb597418b6dec4a47f7d0c36325fec7ac67bc8402b94/mypy-1.20.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:b20c8b0fd5877abdf402e79a3af987053de07e6fb208c18df6659f708b535134", size = 14485344, upload-time = "2026-03-31T16:49:16.78Z" }, + { url = "https://files.pythonhosted.org/packages/bb/72/8927d84cfc90c6abea6e96663576e2e417589347eb538749a464c4c218a0/mypy-1.20.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:367e5c993ba34d5054d11937d0485ad6dfc60ba760fa326c01090fc256adf15c", size = 13327400, upload-time = "2026-03-31T16:53:08.02Z" }, + { url = "https://files.pythonhosted.org/packages/ab/4a/11ab99f9afa41aa350178d24a7d2da17043228ea10f6456523f64b5a6cf6/mypy-1.20.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f799d9db89fc00446f03281f84a221e50018fc40113a3ba9864b132895619ebe", size = 13706384, upload-time = "2026-03-31T16:52:28.577Z" }, + { url = "https://files.pythonhosted.org/packages/42/79/694ca73979cfb3535ebfe78733844cd5aff2e63304f59bf90585110d975a/mypy-1.20.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:555658c611099455b2da507582ea20d2043dfdfe7f5ad0add472b1c6238b433f", size = 14700378, upload-time = "2026-03-31T16:48:45.527Z" }, + { url = "https://files.pythonhosted.org/packages/84/24/a022ccab3a46e3d2cdf2e0e260648633640eb396c7e75d5a42818a8d3971/mypy-1.20.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:efe8d70949c3023698c3fca1e94527e7e790a361ab8116f90d11221421cd8726", size = 14932170, upload-time = "2026-03-31T16:49:36.038Z" }, + { url = 
"https://files.pythonhosted.org/packages/d8/9b/549228d88f574d04117e736f55958bd4908f980f9f5700a07aeb85df005b/mypy-1.20.0-cp313-cp313-win_amd64.whl", hash = "sha256:f49590891d2c2f8a9de15614e32e459a794bcba84693c2394291a2038bbaaa69", size = 10888526, upload-time = "2026-03-31T16:50:59.827Z" }, + { url = "https://files.pythonhosted.org/packages/91/17/15095c0e54a8bc04d22d4ff06b2139d5f142c2e87520b4e39010c4862771/mypy-1.20.0-cp313-cp313-win_arm64.whl", hash = "sha256:76a70bf840495729be47510856b978f1b0ec7d08f257ca38c9d932720bf6b43e", size = 9816456, upload-time = "2026-03-31T16:49:59.537Z" }, + { url = "https://files.pythonhosted.org/packages/4e/0e/6ca4a84cbed9e62384bc0b2974c90395ece5ed672393e553996501625fc5/mypy-1.20.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:0f42dfaab7ec1baff3b383ad7af562ab0de573c5f6edb44b2dab016082b89948", size = 14483331, upload-time = "2026-03-31T16:52:57.999Z" }, + { url = "https://files.pythonhosted.org/packages/7d/c5/5fe9d8a729dd9605064691816243ae6c49fde0bd28f6e5e17f6a24203c43/mypy-1.20.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:31b5dbb55293c1bd27c0fc813a0d2bb5ceef9d65ac5afa2e58f829dab7921fd5", size = 13342047, upload-time = "2026-03-31T16:54:21.555Z" }, + { url = "https://files.pythonhosted.org/packages/4c/33/e18bcfa338ca4e6b2771c85d4c5203e627d0c69d9de5c1a2cf2ba13320ba/mypy-1.20.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:49d11c6f573a5a08f77fad13faff2139f6d0730ebed2cfa9b3d2702671dd7188", size = 13719585, upload-time = "2026-03-31T16:51:53.89Z" }, + { url = "https://files.pythonhosted.org/packages/6b/8d/93491ff7b79419edc7eabf95cb3b3f7490e2e574b2855c7c7e7394ff933f/mypy-1.20.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7d3243c406773185144527f83be0e0aefc7bf4601b0b2b956665608bf7c98a83", size = 14685075, upload-time = "2026-03-31T16:54:04.464Z" }, + { url = 
"https://files.pythonhosted.org/packages/b5/9d/d924b38a4923f8d164bf2b4ec98bf13beaf6e10a5348b4b137eadae40a6e/mypy-1.20.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:a79c1eba7ac4209f2d850f0edd0a2f8bba88cbfdfefe6fb76a19e9d4fe5e71a2", size = 14919141, upload-time = "2026-03-31T16:54:51.785Z" }, + { url = "https://files.pythonhosted.org/packages/59/98/1da9977016678c0b99d43afe52ed00bb3c1a0c4c995d3e6acca1a6ebb9b4/mypy-1.20.0-cp314-cp314-win_amd64.whl", hash = "sha256:00e047c74d3ec6e71a2eb88e9ea551a2edb90c21f993aefa9e0d2a898e0bb732", size = 11050925, upload-time = "2026-03-31T16:51:30.758Z" }, + { url = "https://files.pythonhosted.org/packages/5e/e3/ba0b7a3143e49a9c4f5967dde6ea4bf8e0b10ecbbcca69af84027160ee89/mypy-1.20.0-cp314-cp314-win_arm64.whl", hash = "sha256:931a7630bba591593dcf6e97224a21ff80fb357e7982628d25e3c618e7f598ef", size = 10001089, upload-time = "2026-03-31T16:49:43.632Z" }, + { url = "https://files.pythonhosted.org/packages/12/28/e617e67b3be9d213cda7277913269c874eb26472489f95d09d89765ce2d8/mypy-1.20.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:26c8b52627b6552f47ff11adb4e1509605f094e29815323e487fc0053ebe93d1", size = 15534710, upload-time = "2026-03-31T16:52:12.506Z" }, + { url = "https://files.pythonhosted.org/packages/6e/0c/3b5f2d3e45dc7169b811adce8451679d9430399d03b168f9b0489f43adaa/mypy-1.20.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:39362cdb4ba5f916e7976fccecaab1ba3a83e35f60fa68b64e9a70e221bb2436", size = 14393013, upload-time = "2026-03-31T16:54:41.186Z" }, + { url = "https://files.pythonhosted.org/packages/a3/49/edc8b0aa145cc09c1c74f7ce2858eead9329931dcbbb26e2ad40906daa4e/mypy-1.20.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:34506397dbf40c15dc567635d18a21d33827e9ab29014fb83d292a8f4f8953b6", size = 15047240, upload-time = "2026-03-31T16:54:31.955Z" }, + { url = 
"https://files.pythonhosted.org/packages/42/37/a946bb416e37a57fa752b3100fd5ede0e28df94f92366d1716555d47c454/mypy-1.20.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:555493c44a4f5a1b58d611a43333e71a9981c6dbe26270377b6f8174126a0526", size = 15858565, upload-time = "2026-03-31T16:53:36.997Z" }, + { url = "https://files.pythonhosted.org/packages/2f/99/7690b5b5b552db1bd4ff362e4c0eb3107b98d680835e65823fbe888c8b78/mypy-1.20.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:2721f0ce49cb74a38f00c50da67cb7d36317b5eda38877a49614dc018e91c787", size = 16087874, upload-time = "2026-03-31T16:52:48.313Z" }, + { url = "https://files.pythonhosted.org/packages/aa/76/53e893a498138066acd28192b77495c9357e5a58cc4be753182846b43315/mypy-1.20.0-cp314-cp314t-win_amd64.whl", hash = "sha256:47781555a7aa5fedcc2d16bcd72e0dc83eb272c10dd657f9fb3f9cc08e2e6abb", size = 12572380, upload-time = "2026-03-31T16:49:52.454Z" }, + { url = "https://files.pythonhosted.org/packages/76/9c/6dbdae21f01b7aacddc2c0bbf3c5557aa547827fdf271770fe1e521e7093/mypy-1.20.0-cp314-cp314t-win_arm64.whl", hash = "sha256:c70380fe5d64010f79fb863b9081c7004dd65225d2277333c219d93a10dad4dd", size = 10381174, upload-time = "2026-03-31T16:51:20.179Z" }, + { url = "https://files.pythonhosted.org/packages/21/66/4d734961ce167f0fd8380769b3b7c06dbdd6ff54c2190f3f2ecd22528158/mypy-1.20.0-py3-none-any.whl", hash = "sha256:a6e0641147cbfa7e4e94efdb95c2dab1aff8cfc159ded13e07f308ddccc8c48e", size = 2636365, upload-time = "2026-03-31T16:51:44.911Z" }, +] + +[[package]] +name = "mypy-extensions" +version = "1.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a2/6e/371856a3fb9d31ca8dac321cda606860fa4548858c0cc45d9d1d4ca2628b/mypy_extensions-1.1.0.tar.gz", hash = "sha256:52e68efc3284861e772bbcd66823fde5ae21fd2fdb51c62a211403730b916558", size = 6343, upload-time = "2025-04-22T14:54:24.164Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/79/7b/2c79738432f5c924bef5071f933bcc9efd0473bac3b4aa584a6f7c1c8df8/mypy_extensions-1.1.0-py3-none-any.whl", hash = "sha256:1be4cccdb0f2482337c4743e60421de3a356cd97508abadd57d47403e94f5505", size = 4963, upload-time = "2025-04-22T14:54:22.983Z" }, +] + [[package]] name = "nodeenv" version = "1.9.1" @@ -721,11 +873,11 @@ wheels = [ [[package]] name = "pathspec" -version = "0.12.1" +version = "1.0.4" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/ca/bc/f35b8446f4531a7cb215605d100cd88b7ac6f44ab3fc94870c120ab3adbf/pathspec-0.12.1.tar.gz", hash = "sha256:a482d51503a1ab33b1c67a6c3813a26953dbdc71c31dacaef9a838c4e29f5712", size = 51043, upload-time = "2023-12-10T22:30:45Z" } +sdist = { url = "https://files.pythonhosted.org/packages/fa/36/e27608899f9b8d4dff0617b2d9ab17ca5608956ca44461ac14ac48b44015/pathspec-1.0.4.tar.gz", hash = "sha256:0210e2ae8a21a9137c0d470578cb0e595af87edaa6ebf12ff176f14a02e0e645", size = 131200, upload-time = "2026-01-27T03:59:46.938Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/cc/20/ff623b09d963f88bfde16306a54e12ee5ea43e9b597108672ff3a408aad6/pathspec-0.12.1-py3-none-any.whl", hash = "sha256:a0d503e138a4c123b27490a4f7beda6a01c6f288df0e4a8b79c7eb0dc7b4cc08", size = 31191, upload-time = "2023-12-10T22:30:43.14Z" }, + { url = "https://files.pythonhosted.org/packages/ef/3c/2c197d226f9ea224a9ab8d197933f9da0ae0aac5b6e0f884e2b8d9c8e9f7/pathspec-1.0.4-py3-none-any.whl", hash = "sha256:fb6ae2fd4e7c921a165808a552060e722767cfa526f99ca5156ed2ce45a5c723", size = 55206, upload-time = "2026-01-27T03:59:45.137Z" }, ] [[package]] @@ -785,6 +937,7 @@ web = [ dev = [ { name = "jsonschema" }, { name = "mkdocs-material" }, + { name = "mypy" }, { name = "pre-commit" }, { name = "pytest" }, { name = "pytest-asyncio" }, @@ -811,6 +964,7 @@ provides-extras = ["rich", "web", "evals"] dev = [ { name = "jsonschema", specifier = ">=4.0.0" }, { name = 
"mkdocs-material", specifier = ">=9.7.0" }, + { name = "mypy", specifier = ">=1.0" }, { name = "pre-commit", specifier = ">=4.5.0" }, { name = "pytest", specifier = ">=9.0.1" }, { name = "pytest-asyncio", specifier = ">=1.3.0" }, From 752ddbca9e0909e6ec011d9a4037f860003410da Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Fri, 3 Apr 2026 05:10:29 +0200 Subject: [PATCH 20/60] refactor(evals): replace `session.eval` with `EvalSuite` for cleaner API - Introduced `EvalSuite` class to encapsulate eval logic, replacing inline `session.eval()` definitions. - Removed duplicate `eval` methods in `ProTestSession` and `ProTestSuite`. - Updated tests and examples to leverage `EvalSuite`. --- examples/yorkshire/evals/session.py | 9 +- protest/core/session.py | 81 +-------------- protest/core/suite.py | 23 ----- protest/evals/__init__.py | 13 --- protest/evals/session.py | 57 ++++++++++- protest/evals/suite.py | 87 +++++++++++++++++ tests/evals/test_e2e.py | 146 ++++++++++++++++++++++------ tests/evals/test_judge.py | 31 ++++-- 8 files changed, 284 insertions(+), 163 deletions(-) create mode 100644 protest/evals/suite.py diff --git a/examples/yorkshire/evals/session.py b/examples/yorkshire/evals/session.py index 7779f66..f1800d8 100644 --- a/examples/yorkshire/evals/session.py +++ b/examples/yorkshire/evals/session.py @@ -16,14 +16,19 @@ yorkshire_cases, ) from protest import From -from protest.evals import EvalSession, ModelInfo +from protest.evals import ModelInfo +from protest.evals.session import EvalSession +from protest.evals.suite import EvalSuite session = EvalSession( model=ModelInfo(name="yorkshire-chatbot-v1", provider="local"), metadata={"version": "1.0", "type": "keyword-matching"}, ) +yorkshire_suite = EvalSuite("yorkshire_eval") +session.add_suite(yorkshire_suite) -@session.eval(evaluators=suite_evaluators) + +@yorkshire_suite.eval(evaluators=suite_evaluators) def yorkshire_eval(case: Annotated[dict, 
From(yorkshire_cases)]) -> str: return yorkshire_chatbot(case["inputs"]) diff --git a/protest/core/session.py b/protest/core/session.py index efef4fb..daafa74 100644 --- a/protest/core/session.py +++ b/protest/core/session.py @@ -8,13 +8,12 @@ from types import TracebackType from protest.compat import Self + from protest.core.suite import ProTestSuite from protest.entities import FixtureCallable - from protest.evals.types import JudgeInfo, ModelInfo from protest.plugin import PluginBase, PluginContext from protest.cache.plugin import CachePlugin from protest.cache.storage import CacheStorage -from protest.core.suite import ProTestSuite from protest.di.container import FixtureContainer from protest.di.decorators import get_fixture_marker, unwrap_fixture from protest.entities import ( @@ -22,16 +21,12 @@ FixtureScope, Retry, Skip, - SuiteKind, TestRegistration, Xfail, normalize_retry, normalize_skip, normalize_xfail, ) -from protest.evals.history import EvalHistoryPlugin -from protest.evals.results_writer import EvalResultsWriter -from protest.evals.wrapper import make_eval_wrapper from protest.events.bus import EventBus from protest.events.types import Event from protest.exceptions import InvalidMaxConcurrencyError @@ -88,9 +83,6 @@ def __init__( self._history = history self._history_dir = history_dir self._metadata: dict[str, Any] = dict(metadata) if metadata else {} - self._eval_model: ModelInfo | None = None # set by EvalSession - self._eval_judge: JudgeInfo | None = None # set by EvalSession - self._eval_judge_instance: Any = None # set by EvalSession async def resolve_autouse(self) -> None: """Resolve all session autouse fixtures at session start.""" @@ -207,51 +199,6 @@ def decorator(func: FuncT) -> FuncT: return decorator - def eval( - self, - evaluators: list[Any] | None = None, - expected_key: str = "expected", - tags: list[str] | None = None, - timeout: float | None = None, - name: str | None = None, - model: Any = None, - ) -> Callable[[FuncT], FuncT]: - 
"""Register a scored eval test. - - Creates an implicit eval suite named after the function. - The decorated function's return value is passed to evaluators. - Use with ForEach/From for parametrization:: - - @session.eval(evaluators=[my_scorer], model=ModelInfo(name="qwen")) - async def my_eval(case: Annotated[dict, From(cases)]) -> str: - return await run(case["q"]) - """ - - def decorator(func: FuncT) -> FuncT: - suite_name = name or func.__name__ - suite_meta: dict[str, Any] = {} - resolved_model = model or self._eval_model - if resolved_model: - suite_meta["model"] = resolved_model.name - suite_meta["provider"] = resolved_model.provider - suite = ProTestSuite( - name=suite_name, - tags=list(tags or []), - kind=SuiteKind.EVAL, - metadata=suite_meta, - ) - wrapper = make_eval_wrapper( - func, - evaluators or [], - expected_key, - judge=self._eval_judge_instance, - ) - suite.test(tags=tags, timeout=timeout, is_eval=True)(wrapper) - self.add_suite(suite) - return func - - return decorator - def add_suite(self, suite: ProTestSuite) -> None: """Add a suite to this session.""" suite._attach_to_session(self) @@ -375,32 +322,6 @@ def activate_plugins(self, ctx: PluginContext) -> None: if instance is not None: self.register_plugin(instance) - # Auto-wire eval support if any suite has kind="eval" - if any(s.kind == SuiteKind.EVAL for s in self._suites): - self._wire_eval_support() - - def _wire_eval_support(self) -> None: - """Wire eval history + results writer plugins (no EvalPlugin).""" - - judge_dict = None - if self._eval_judge: - judge_dict = { - "name": self._eval_judge.name, - "provider": self._eval_judge.provider, - "evaluators": list(self._eval_judge.evaluators), - } - - history = EvalHistoryPlugin( - history_dir=self._history_dir, - model=self._eval_model, - judge=judge_dict, - metadata=self._metadata, - ) - self.register_plugin(history) - - writer = EvalResultsWriter(history_dir=self._history_dir) - self.register_plugin(writer) - async def __aenter__(self) -> 
Self: self._register_fixtures() await self._resolver.__aenter__() diff --git a/protest/core/suite.py b/protest/core/suite.py index 99b4fa2..b73e9f0 100644 --- a/protest/core/suite.py +++ b/protest/core/suite.py @@ -22,7 +22,6 @@ normalize_skip, normalize_xfail, ) -from protest.evals.wrapper import make_eval_wrapper from protest.exceptions import ConcurrencyMismatchError, InvalidMaxConcurrencyError FuncT = TypeVar("FuncT", bound="Callable[..., object]") @@ -161,28 +160,6 @@ def decorator(func: FuncT) -> FuncT: return decorator - def eval( - self, - evaluators: list[Any] | None = None, - expected_key: str = "expected", - tags: list[str] | None = None, - timeout: float | None = None, - judge: Any = None, - ) -> Callable[[FuncT], FuncT]: - """Register a scored eval test on this suite.""" - - def decorator(func: FuncT) -> FuncT: - wrapper = make_eval_wrapper( - func, - evaluators or [], - expected_key, - judge=judge, - ) - self.test(tags=tags, timeout=timeout, is_eval=True)(wrapper) - return func - - return decorator - def add_suite(self, suite: ProTestSuite) -> None: """Add a child suite. Child can access parent's fixtures.""" parent_effective = self.effective_max_concurrency diff --git a/protest/evals/__init__.py b/protest/evals/__init__.py index d90b8f4..8584eff 100644 --- a/protest/evals/__init__.py +++ b/protest/evals/__init__.py @@ -26,7 +26,6 @@ "EvalCaseResult", "EvalContext", "EvalScore", - "EvalSession", "EvalSuiteReport", "Judge", "JudgeInfo", @@ -40,15 +39,3 @@ "Verdict", "evaluator", ] - - -def __getattr__(name: str) -> object: - # EvalSession imports protest.core.session which imports reporters, - # and reporters import protest.evals.types — eagerly importing - # EvalSession here would create a circular import chain. 
- if name == "EvalSession": - from protest.evals.session import EvalSession # noqa: PLC0415 — circular import - - return EvalSession - msg = f"module {__name__!r} has no attribute {name!r}" - raise AttributeError(msg) diff --git a/protest/evals/session.py b/protest/evals/session.py index 09f0d5c..9ed7459 100644 --- a/protest/evals/session.py +++ b/protest/evals/session.py @@ -5,12 +5,18 @@ from typing import TYPE_CHECKING, Any from protest.core.session import ProTestSession +from protest.entities import SuiteKind +from protest.evals.history import EvalHistoryPlugin +from protest.evals.results_writer import EvalResultsWriter +from protest.evals.suite import EvalSuite from protest.evals.types import JudgeInfo if TYPE_CHECKING: from pathlib import Path + from protest.core.suite import ProTestSuite from protest.evals.types import Judge, ModelInfo + from protest.plugin import PluginContext class EvalSession(ProTestSession): @@ -20,7 +26,10 @@ class EvalSession(ProTestSession): session = EvalSession(model=ModelInfo(name="qwen-2.5")) - @session.eval(evaluators=[contains_facts]) + chatbot = EvalSuite("chatbot") + session.add_suite(chatbot) + + @chatbot.eval(evaluators=[contains_facts]) async def chatbot(case: Annotated[dict, From(cases)]) -> str: return await ask(case["q"]) """ @@ -43,7 +52,45 @@ def __init__( ) self._eval_model = model self._eval_judge_instance: Judge | None = judge - if judge is not None: - self._eval_judge = JudgeInfo(name=judge.name, provider=judge.provider) - else: - self._eval_judge = None + self._eval_judge: JudgeInfo | None = ( + JudgeInfo(name=judge.name, provider=judge.provider) + if judge is not None + else None + ) + + def add_suite(self, suite: ProTestSuite) -> None: + """Add a suite, propagating session-level model/judge as defaults.""" + if isinstance(suite, EvalSuite): + if suite.judge is None and self._eval_judge_instance is not None: + suite._judge = self._eval_judge_instance + if self._eval_model and "model" not in suite.suite_metadata: 
+ suite._metadata["model"] = self._eval_model.name + suite._metadata["provider"] = self._eval_model.provider + super().add_suite(suite) + + def activate_plugins(self, ctx: PluginContext) -> None: + """Activate plugins, then wire eval support if needed.""" + super().activate_plugins(ctx) + if any(s.kind == SuiteKind.EVAL for s in self._suites): + self._wire_eval_support() + + def _wire_eval_support(self) -> None: + """Wire eval history + results writer plugins.""" + judge_dict = None + if self._eval_judge: + judge_dict = { + "name": self._eval_judge.name, + "provider": self._eval_judge.provider, + "evaluators": list(self._eval_judge.evaluators), + } + + history = EvalHistoryPlugin( + history_dir=self._history_dir, + model=self._eval_model, + judge=judge_dict, + metadata=self._metadata, + ) + self.register_plugin(history) + + writer = EvalResultsWriter(history_dir=self._history_dir) + self.register_plugin(writer) diff --git a/protest/evals/suite.py b/protest/evals/suite.py new file mode 100644 index 0000000..279aec7 --- /dev/null +++ b/protest/evals/suite.py @@ -0,0 +1,87 @@ +"""EvalSuite — suite dédiée aux evals.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Any, TypeVar + +from protest.core.suite import ProTestSuite +from protest.entities import SuiteKind +from protest.evals.wrapper import make_eval_wrapper + +if TYPE_CHECKING: + from collections.abc import Callable + + from protest.evals.types import Judge, ModelInfo + +FuncT = TypeVar("FuncT", bound="Callable[..., object]") + + +class EvalSuite(ProTestSuite): + """Suite dédiée aux evals. 
+ + Usage:: + + chatbot = EvalSuite("chatbot") + session.add_suite(chatbot) + + @chatbot.eval(evaluators=[contains_facts]) + async def chatbot(case: Annotated[dict, From(cases)]) -> str: + return await ask(case["q"]) + """ + + def __init__( + self, + name: str, + *, + model: ModelInfo | None = None, + judge: Judge | None = None, + tags: list[str] | None = None, + max_concurrency: int | None = None, + description: str | None = None, + metadata: dict[str, Any] | None = None, + ) -> None: + suite_meta: dict[str, Any] = dict(metadata) if metadata else {} + if model is not None: + suite_meta["model"] = model.name + suite_meta["provider"] = model.provider + super().__init__( + name=name, + kind=SuiteKind.EVAL, + tags=tags, + max_concurrency=max_concurrency, + description=description, + metadata=suite_meta, + ) + self._judge: Judge | None = judge + self._model = model + + @property + def judge(self) -> Judge | None: + return self._judge + + @property + def model(self) -> ModelInfo | None: + return self._model + + def eval( + self, + evaluators: list[Any] | None = None, + expected_key: str = "expected", + tags: list[str] | None = None, + timeout: float | None = None, + judge: Any = None, + ) -> Callable[[FuncT], FuncT]: + """Register a scored eval test on this suite.""" + + def decorator(func: FuncT) -> FuncT: + resolved_judge = judge or self._judge + wrapper = make_eval_wrapper( + func, + evaluators or [], + expected_key, + judge=resolved_judge, + ) + self.test(tags=tags, timeout=timeout, is_eval=True)(wrapper) + return func + + return decorator diff --git a/tests/evals/test_e2e.py b/tests/evals/test_e2e.py index 72ef8ff..2ecc09b 100644 --- a/tests/evals/test_e2e.py +++ b/tests/evals/test_e2e.py @@ -1,7 +1,7 @@ """End-to-end tests for ProTest evals integration. These tests define the PUBLIC API contract. 
They test what the user sees: -- Session setup (EvalSession, @session.eval with ForEach/From) +- Session setup (EvalSession, EvalSuite + @suite.eval with ForEach/From) - CLI behavior (protest run vs protest eval) - Output format (scores table, trends, failure messages) - History (JSONL format, stats, significance, clean-dirty) @@ -26,7 +26,6 @@ from protest.entities import SuiteKind from protest.evals import ( EvalContext, - EvalSession, Metric, ModelInfo, ShortCircuit, @@ -46,6 +45,8 @@ ) from protest.evals.hashing import compute_case_hash, compute_eval_hash from protest.evals.results_writer import EvalResultsWriter +from protest.evals.session import EvalSession +from protest.evals.suite import EvalSuite from protest.evals.types import EvalSuiteReport # noqa: TC001 — used at runtime from protest.filters.kind import KindFilterPlugin from protest.history.storage import append_entry, clean_dirty @@ -103,12 +104,15 @@ async def async_echo_task(text: str) -> str: class TestEvalSession: - """EvalSession setup: constructor with model=, @session.eval.""" + """EvalSession setup: constructor with model=, EvalSuite + @suite.eval.""" def test_add_eval_creates_eval_kind(self) -> None: session = EvalSession() - @session.eval(evaluators=[fake_accuracy]) + eval_echo_suite = EvalSuite("eval_echo") + session.add_suite(eval_echo_suite) + + @eval_echo_suite.eval(evaluators=[fake_accuracy]) def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: return echo_task(case["inputs"]) @@ -129,7 +133,10 @@ def test_eval_with_bool_verdict(self) -> None: """Evaluator with bool field: case_fail has matches_expected=False -> fail.""" session = EvalSession() - @session.eval(evaluators=[fake_accuracy]) + eval_echo_suite = EvalSuite("eval_echo") + session.add_suite(eval_echo_suite) + + @eval_echo_suite.eval(evaluators=[fake_accuracy]) def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: return echo_task(case["inputs"]) @@ -142,7 +149,10 @@ def eval_echo(case: Annotated[dict, 
From(basic_cases)]) -> str: def test_async_task_works(self) -> None: session = EvalSession() - @session.eval(evaluators=[fake_accuracy]) + eval_echo_suite = EvalSuite("eval_echo") + session.add_suite(eval_echo_suite) + + @eval_echo_suite.eval(evaluators=[fake_accuracy]) async def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: return await async_echo_task(case["inputs"]) @@ -160,7 +170,10 @@ def test_async_evaluator_does_not_crash(self) -> None: session = EvalSession() - @session.eval(evaluators=[async_fake_accuracy]) + eval_echo_suite = EvalSuite("eval_echo") + session.add_suite(eval_echo_suite) + + @eval_echo_suite.eval(evaluators=[async_fake_accuracy]) def eval_echo(case: Annotated[dict, From(single_case)]) -> str: return echo_task(case["inputs"]) @@ -184,7 +197,10 @@ def test_test_suite_has_kind_test(self) -> None: def test_eval_suite_has_kind_eval(self) -> None: session = EvalSession() - @session.eval(evaluators=[fake_accuracy]) + eval_echo_suite = EvalSuite("eval_echo") + session.add_suite(eval_echo_suite) + + @eval_echo_suite.eval(evaluators=[fake_accuracy]) def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: return echo_task(case["inputs"]) @@ -229,7 +245,10 @@ def test_a() -> None: session.add_suite(test_suite) - @session.eval(evaluators=[fake_accuracy]) + eval_echo_suite = EvalSuite("eval_echo") + session.add_suite(eval_echo_suite) + + @eval_echo_suite.eval(evaluators=[fake_accuracy]) def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: return echo_task(case["inputs"]) @@ -251,7 +270,10 @@ def test_a() -> None: session.add_suite(test_suite) - @session.eval(evaluators=[fake_accuracy]) + eval_echo_suite = EvalSuite("eval_echo") + session.add_suite(eval_echo_suite) + + @eval_echo_suite.eval(evaluators=[fake_accuracy]) def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: return echo_task(case["inputs"]) @@ -286,7 +308,10 @@ def on_eval_suite_end(self, report: Any) -> None: session = EvalSession() 
session.register_plugin(ReportCapture()) - @session.eval(evaluators=[fake_accuracy]) + eval_echo_suite = EvalSuite("eval_echo") + session.add_suite(eval_echo_suite) + + @eval_echo_suite.eval(evaluators=[fake_accuracy]) def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: return echo_task(case["inputs"]) @@ -311,7 +336,10 @@ def on_eval_suite_end(self, report: Any) -> None: session = EvalSession() session.register_plugin(ReportCapture()) - @session.eval(evaluators=[fake_accuracy]) + eval_echo_suite = EvalSuite("eval_echo") + session.add_suite(eval_echo_suite) + + @eval_echo_suite.eval(evaluators=[fake_accuracy]) def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: return echo_task(case["inputs"]) @@ -335,7 +363,10 @@ def on_test_fail(self, result: Any) -> None: session = EvalSession() session.register_plugin(ErrorCollector()) - @session.eval(evaluators=[fake_accuracy]) + eval_echo_suite = EvalSuite("eval_echo") + session.add_suite(eval_echo_suite) + + @eval_echo_suite.eval(evaluators=[fake_accuracy]) def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: return echo_task(case["inputs"]) @@ -368,7 +399,10 @@ def on_test_fail(self, result: Any) -> None: session = EvalSession() session.register_plugin(Collector()) - @session.eval(evaluators=[fake_accuracy]) + eval_echo_suite = EvalSuite("eval_echo") + session.add_suite(eval_echo_suite) + + @eval_echo_suite.eval(evaluators=[fake_accuracy]) def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: return echo_task(case["inputs"]) @@ -400,7 +434,10 @@ def on_test_teardown_start(self, info: Any) -> None: session = EvalSession() session.register_plugin(LifecycleCollector()) - @session.eval(evaluators=[fake_accuracy]) + eval_echo_suite = EvalSuite("eval_echo") + session.add_suite(eval_echo_suite) + + @eval_echo_suite.eval(evaluators=[fake_accuracy]) def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: return echo_task(case["inputs"]) @@ -437,7 +474,10 @@ def 
crashing_evaluator(ctx: EvalContext) -> bool: session = EvalSession() session.register_plugin(Collector()) - @session.eval(evaluators=[crashing_evaluator]) + eval_echo_suite = EvalSuite("eval_echo") + session.add_suite(eval_echo_suite) + + @eval_echo_suite.eval(evaluators=[crashing_evaluator]) def eval_echo(case: Annotated[dict, From(single_case)]) -> str: return echo_task(case["inputs"]) @@ -483,7 +523,10 @@ class TestHistory: def _run_eval(self, tmp_path: Path) -> None: session = EvalSession(model=ModelInfo(name="test-model"), history_dir=tmp_path) - @session.eval(evaluators=[fake_accuracy]) + eval_echo_suite = EvalSuite("eval_echo") + session.add_suite(eval_echo_suite) + + @eval_echo_suite.eval(evaluators=[fake_accuracy]) def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: return echo_task(case["inputs"]) @@ -546,7 +589,10 @@ def test_history_metadata_included(self, tmp_path: Path) -> None: metadata={"env": "test", "version": "1.0"}, ) - @session.eval(evaluators=[fake_accuracy]) + eval_echo_suite = EvalSuite("eval_echo") + session.add_suite(eval_echo_suite) + + @eval_echo_suite.eval(evaluators=[fake_accuracy]) def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: return echo_task(case["inputs"]) @@ -610,7 +656,10 @@ def test_case_hash_stored_in_history(self, tmp_path: Path) -> None: """History entries include case_hash and eval_hash per case.""" session = EvalSession(history_dir=tmp_path) - @session.eval(evaluators=[fake_accuracy]) + eval_echo_suite = EvalSuite("eval_echo") + session.add_suite(eval_echo_suite) + + @eval_echo_suite.eval(evaluators=[fake_accuracy]) def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: return echo_task(case["inputs"]) @@ -749,7 +798,10 @@ def on_test_fail(self, result: Any) -> None: session = EvalSession() session.register_plugin(Collector()) - @session.eval(evaluators=[not_empty]) + eval_echo_suite = EvalSuite("eval_echo") + session.add_suite(eval_echo_suite) + + 
@eval_echo_suite.eval(evaluators=[not_empty]) def eval_echo(case: Annotated[dict, From(single_case)]) -> str: return echo_task(case["inputs"]) @@ -783,7 +835,10 @@ def on_test_fail(self, result: Any) -> None: session = EvalSession() session.register_plugin(Collector()) - @session.eval(evaluators=[word_overlap]) + eval_echo_suite = EvalSuite("eval_echo") + session.add_suite(eval_echo_suite) + + @eval_echo_suite.eval(evaluators=[word_overlap]) def eval_echo(case: Annotated[dict, From(single_case)]) -> str: return echo_task(case["inputs"]) @@ -815,7 +870,10 @@ def bad_evaluator(ctx: EvalContext) -> float: session = EvalSession() session.register_plugin(Collector()) - @session.eval(evaluators=[bad_evaluator]) + eval_echo_suite = EvalSuite("eval_echo") + session.add_suite(eval_echo_suite) + + @eval_echo_suite.eval(evaluators=[bad_evaluator]) def eval_echo(case: Annotated[dict, From(single_case)]) -> str: return echo_task(case["inputs"]) @@ -844,7 +902,10 @@ def expensive(ctx: EvalContext) -> bool: session = EvalSession() - @session.eval(evaluators=[ShortCircuit([cheap, expensive])]) + eval_echo_suite = EvalSuite("eval_echo") + session.add_suite(eval_echo_suite) + + @eval_echo_suite.eval(evaluators=[ShortCircuit([cheap, expensive])]) def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: return echo_task(case["inputs"]) @@ -874,7 +935,10 @@ def check_b(ctx: EvalContext) -> bool: ) session = EvalSession() - @session.eval(evaluators=[ShortCircuit([check_a, check_b])]) + eval_echo_suite = EvalSuite("eval_echo") + session.add_suite(eval_echo_suite) + + @eval_echo_suite.eval(evaluators=[ShortCircuit([check_a, check_b])]) def eval_echo(case: Annotated[dict, From(single)]) -> str: return echo_task(case["inputs"]) @@ -899,7 +963,10 @@ def _run_eval(self, tmp_path: Path) -> Path: writer = EvalResultsWriter(history_dir=tmp_path) session.register_plugin(writer) - @session.eval(evaluators=[fake_accuracy]) + eval_echo_suite = EvalSuite("eval_echo") + 
session.add_suite(eval_echo_suite) + + @eval_echo_suite.eval(evaluators=[fake_accuracy]) def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: return echo_task(case["inputs"]) @@ -947,7 +1014,7 @@ def test_case_file_contains_inputs(self, tmp_path: Path) -> None: class TestMultiDatasetHistory: - """Multiple @session.eval calls produce distinct suites in history.""" + """Multiple EvalSuite + @suite.eval calls produce distinct suites in history.""" def _run_multi(self, tmp_path: Path) -> dict[str, Any]: pipeline_cases = ForEach( @@ -966,11 +1033,17 @@ def _run_multi(self, tmp_path: Path) -> dict[str, Any]: session = EvalSession(history_dir=tmp_path) - @session.eval(evaluators=[fake_accuracy]) + pipeline_suite = EvalSuite("pipeline") + session.add_suite(pipeline_suite) + + @pipeline_suite.eval(evaluators=[fake_accuracy]) def pipeline(case: Annotated[dict, From(pipeline_cases)]) -> str: return echo_task(case["inputs"]) - @session.eval(evaluators=[fake_accuracy]) + ingest_suite = EvalSuite("ingest") + session.add_suite(ingest_suite) + + @ingest_suite.eval(evaluators=[fake_accuracy]) def ingest(case: Annotated[dict, From(ingest_cases)]) -> str: return echo_task(case["inputs"]) @@ -996,14 +1069,17 @@ def test_each_suite_has_its_own_cases(self, tmp_path: Path) -> None: class TestEvalTaskFixtures: - """@session.eval() peut utiliser des fixtures protest via Use().""" + """EvalSuite + @suite.eval() peut utiliser des fixtures protest via Use().""" def test_task_without_fixtures_still_works(self) -> None: # basic_cases has one match (case_pass) and one mismatch (case_fail) # fake_accuracy returns matches_expected=False for case_fail -> fail session = EvalSession() - @session.eval(evaluators=[fake_accuracy]) + eval_echo_suite = EvalSuite("eval_echo") + session.add_suite(eval_echo_suite) + + @eval_echo_suite.eval(evaluators=[fake_accuracy]) def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: return echo_task(case["inputs"]) @@ -1028,7 +1104,10 @@ def 
prefix_service() -> str: session = EvalSession() session.bind(prefix_service) - @session.eval(evaluators=[fake_accuracy]) + eval_prefixed_suite = EvalSuite("eval_prefixed") + session.add_suite(eval_prefixed_suite) + + @eval_prefixed_suite.eval(evaluators=[fake_accuracy]) async def eval_prefixed( case: Annotated[dict, From(single_case)], svc: Annotated[str, Use(prefix_service)], @@ -1063,7 +1142,10 @@ def expensive_resource() -> str: session = EvalSession() session.bind(expensive_resource) - @session.eval(evaluators=[fake_accuracy]) + eval_resource_suite = EvalSuite("eval_resource") + session.add_suite(eval_resource_suite) + + @eval_resource_suite.eval(evaluators=[fake_accuracy]) async def eval_resource( case: Annotated[dict, From(multi_cases)], res: Annotated[str, Use(expensive_resource)], diff --git a/tests/evals/test_judge.py b/tests/evals/test_judge.py index 9e6fd11..7eece34 100644 --- a/tests/evals/test_judge.py +++ b/tests/evals/test_judge.py @@ -11,13 +11,14 @@ from protest.core.runner import TestRunner from protest.evals import ( EvalContext, - EvalSession, Judge, JudgeResponse, TaskResult, Verdict, evaluator, ) +from protest.evals.session import EvalSession +from protest.evals.suite import EvalSuite from protest.plugin import PluginBase # --------------------------------------------------------------------------- @@ -227,8 +228,10 @@ async def judge_evaluator(ctx: EvalContext) -> bool: return await ctx.judge("pass this", bool) session = EvalSession(judge=FakeJudge()) + eval_echo_suite = EvalSuite("eval_echo") + session.add_suite(eval_echo_suite) - @session.eval(evaluators=[judge_evaluator]) + @eval_echo_suite.eval(evaluators=[judge_evaluator]) def eval_echo(case: Annotated[dict, From(single_case)]) -> str: return case["inputs"] @@ -244,8 +247,10 @@ async def needs_judge(ctx: EvalContext) -> bool: return await ctx.judge("test", bool) session = EvalSession() # no judge + eval_echo_suite = EvalSuite("eval_echo") + session.add_suite(eval_echo_suite) - 
@session.eval(evaluators=[needs_judge]) + @eval_echo_suite.eval(evaluators=[needs_judge]) def eval_echo(case: Annotated[dict, From(single_case)]) -> str: return case["inputs"] @@ -274,8 +279,10 @@ async def double_judge(ctx: EvalContext) -> bool: return r1 and r2 session = EvalSession(judge=FakeJudge()) + eval_echo_suite = EvalSuite("eval_echo") + session.add_suite(eval_echo_suite) - @session.eval(evaluators=[double_judge]) + @eval_echo_suite.eval(evaluators=[double_judge]) def eval_echo(case: Annotated[dict, From(single_case)]) -> str: return case["inputs"] @@ -329,8 +336,10 @@ async def struct_evaluator(ctx: EvalContext) -> JudgeVerdict: return await ctx.judge("evaluate this", JudgeVerdict) session = EvalSession(judge=StructuredJudge()) + eval_echo_suite = EvalSuite("eval_echo") + session.add_suite(eval_echo_suite) - @session.eval(evaluators=[struct_evaluator]) + @eval_echo_suite.eval(evaluators=[struct_evaluator]) def eval_echo(case: Annotated[dict, From(single_case)]) -> str: return case["inputs"] @@ -353,8 +362,10 @@ def check_output(ctx: EvalContext) -> bool: return ctx.output == "hello" # sees str, not TaskResult session = EvalSession() + eval_echo_suite = EvalSuite("eval_echo") + session.add_suite(eval_echo_suite) - @session.eval(evaluators=[check_output]) + @eval_echo_suite.eval(evaluators=[check_output]) def eval_echo(case: Annotated[dict, From(single_case)]) -> TaskResult[str]: return TaskResult( output=case["inputs"], @@ -375,8 +386,10 @@ def always_pass(ctx: EvalContext) -> bool: return True session = EvalSession() + eval_echo_suite = EvalSuite("eval_echo") + session.add_suite(eval_echo_suite) - @session.eval(evaluators=[always_pass]) + @eval_echo_suite.eval(evaluators=[always_pass]) def eval_echo(case: Annotated[dict, From(single_case)]) -> TaskResult[str]: return TaskResult( output=case["inputs"], @@ -411,8 +424,10 @@ def always_pass(ctx: EvalContext) -> bool: return True session = EvalSession() + eval_echo_suite = EvalSuite("eval_echo") + 
session.add_suite(eval_echo_suite) - @session.eval(evaluators=[always_pass]) + @eval_echo_suite.eval(evaluators=[always_pass]) def eval_echo(case: Annotated[dict, From(single_case)]) -> str: return case["inputs"] From 62a12a304b3367397008a7920344895a2df40f1a Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Fri, 3 Apr 2026 06:55:13 +0200 Subject: [PATCH 21/60] refactor(evals): replace `dict` with `EvalCase` for eval cases, update APIs and tests - Standardized eval cases by replacing untyped `dict` with `EvalCase` objects across codebase. - Updated evaluator helpers to work exclusively with `EvalCase` instances. - Refactored `make_eval_wrapper` to remove unused `expected_key` argument. - Updated tests and examples to adopt `EvalCase` usage for improved type safety and code clarity. --- docs/evals.md | 60 +++++++++++--- protest/evals/evaluator.py | 2 +- protest/evals/session.py | 4 +- protest/evals/suite.py | 6 +- protest/evals/types.py | 4 +- protest/evals/wrapper.py | 74 +++++++---------- tests/evals/test_e2e.py | 157 +++++++++++++++++++------------------ 7 files changed, 164 insertions(+), 143 deletions(-) diff --git a/docs/evals.md b/docs/evals.md index 006c403..cca70cc 100644 --- a/docs/evals.md +++ b/docs/evals.md @@ -15,8 +15,10 @@ ProTest evals use the same infrastructure as tests: fixtures, DI, parallelism, t from typing import Annotated from protest import ForEach, From -from protest.evals import EvalCase, EvalSession, ModelInfo, evaluator +from protest.evals import EvalCase, ModelInfo, evaluator from protest.evals.evaluators import contains_keywords +from protest.evals.session import EvalSession +from protest.evals.suite import EvalSuite cases = ForEach([ EvalCase(inputs="Who is Marie?", expected="Marie, Resistance", name="lookup"), @@ -25,7 +27,10 @@ cases = ForEach([ session = EvalSession(model=ModelInfo(name="gpt-4o-mini")) -@session.eval(evaluators=[contains_keywords(keywords=["Marie"])]) +chatbot_suite = 
EvalSuite("chatbot") +session.add_suite(chatbot_suite) + +@chatbot_suite.eval(evaluators=[contains_keywords(keywords=["Marie"])]) async def chatbot(case: Annotated[EvalCase, From(cases)]) -> str: return await my_agent(case.inputs) ``` @@ -36,7 +41,7 @@ protest eval evals.session:session ## How It Works -`@session.eval()` wraps a function to run evaluators on its return value: +`@suite.eval()` wraps a function to run evaluators on its return value: 1. Your function receives case data via `ForEach`/`From` (same as parameterized tests) 2. It returns the output (string, object, anything) @@ -48,18 +53,40 @@ The rest of the pipeline — fixtures, DI, parallelism, reporters — works iden ## EvalSession -`EvalSession` is a session configured for evals. History is enabled by default. +`EvalSession` is a session configured for evals. History is enabled by default. Model and judge set on the session are propagated as defaults to `EvalSuite` instances added via `session.add_suite()`. ```python -from protest.evals import EvalSession, ModelInfo +from protest.evals import ModelInfo +from protest.evals.session import EvalSession session = EvalSession( - model=ModelInfo(name="gpt-4o-mini"), # tracked in history + model=ModelInfo(name="gpt-4o-mini"), # propagated to suites, tracked in history concurrency=4, # parallel eval cases metadata={"version": "1.0"}, # stored in history ) ``` +## EvalSuite + +`EvalSuite` groups eval cases. It's the eval equivalent of `ProTestSuite` — it forces `kind=EVAL` and carries model/judge configuration. 
+ +```python +from protest.evals.suite import EvalSuite + +chatbot_suite = EvalSuite("chatbot") +session.add_suite(chatbot_suite) # model/judge propagated from session + +@chatbot_suite.eval(evaluators=[my_scorer]) +async def chatbot(case: Annotated[EvalCase, From(cases)]) -> str: + return await my_agent(case.inputs) +``` + +Per-suite model override: + +```python +chatbot_suite = EvalSuite("chatbot", model=ModelInfo(name="mistral-7b")) +``` + ## EvalCase Typed dataclass for eval case data. Provides IDE autocompletion instead of untyped dicts. @@ -262,7 +289,10 @@ async def pipeline(): session.bind(pipeline) -@session.eval(evaluators=[my_scorer]) +pipeline_suite = EvalSuite("pipeline") +session.add_suite(pipeline_suite) + +@pipeline_suite.eval(evaluators=[my_scorer]) async def pipeline_eval( case: Annotated[EvalCase, From(cases)], driver: Annotated[AsyncDriver, Use(pipeline)], @@ -386,7 +416,7 @@ If your eval task calls an LLM, you can report usage by returning `TaskResult` i ```python from protest.evals import TaskResult -@session.eval(evaluators=[my_scorer]) +@chatbot_suite.eval(evaluators=[my_scorer]) async def chatbot(case: Annotated[EvalCase, From(cases)]) -> TaskResult[str]: result = await agent.run(case.inputs) usage = result.usage() @@ -424,18 +454,24 @@ If two evaluators return dataclasses with the same field name (e.g. both have `a ## Multi-Model Sessions -Track which model produced each eval suite's results: +Track which model produced each eval suite's results. 
Each `EvalSuite` can have its own model: ```python pipeline_model = ModelInfo(name="qwen-2.5") chat_model = ModelInfo(name="mistral-7b") -session = EvalSession(model=pipeline_model) +session = EvalSession(model=pipeline_model) # default model + +pipeline_suite = EvalSuite("pipeline") # inherits pipeline_model from session +chatbot_suite = EvalSuite("chatbot", model=chat_model) # override + +session.add_suite(pipeline_suite) +session.add_suite(chatbot_suite) -@session.eval(evaluators=[...], name="pipeline", model=pipeline_model) +@pipeline_suite.eval(evaluators=[...]) async def pipeline_eval(case, driver) -> str: ... -@session.eval(evaluators=[...], name="chatbot", model=chat_model) +@chatbot_suite.eval(evaluators=[...]) async def chatbot_eval(case, deps) -> str: ... ``` diff --git a/protest/evals/evaluator.py b/protest/evals/evaluator.py index 6d0c980..07dc2f2 100644 --- a/protest/evals/evaluator.py +++ b/protest/evals/evaluator.py @@ -133,7 +133,7 @@ class EvalCase: EvalCase(inputs="Who is Pierre?", expected="Pierre, arrest"), ]) - @session.eval(evaluators=[contains_facts]) + @suite.eval(evaluators=[contains_facts]) def my_eval(case: Annotated[EvalCase, From(cases)]) -> str: return ask(case.inputs) """ diff --git a/protest/evals/session.py b/protest/evals/session.py index 9ed7459..c85527a 100644 --- a/protest/evals/session.py +++ b/protest/evals/session.py @@ -30,8 +30,8 @@ class EvalSession(ProTestSession): session.add_suite(chatbot) @chatbot.eval(evaluators=[contains_facts]) - async def chatbot(case: Annotated[dict, From(cases)]) -> str: - return await ask(case["q"]) + async def chatbot(case: Annotated[EvalCase, From(cases)]) -> str: + return await ask(case.inputs) """ def __init__( diff --git a/protest/evals/suite.py b/protest/evals/suite.py index 279aec7..f0aba7e 100644 --- a/protest/evals/suite.py +++ b/protest/evals/suite.py @@ -25,8 +25,8 @@ class EvalSuite(ProTestSuite): session.add_suite(chatbot) @chatbot.eval(evaluators=[contains_facts]) - async def 
chatbot(case: Annotated[dict, From(cases)]) -> str: - return await ask(case["q"]) + async def chatbot(case: Annotated[EvalCase, From(cases)]) -> str: + return await ask(case.inputs) """ def __init__( @@ -66,7 +66,6 @@ def model(self) -> ModelInfo | None: def eval( self, evaluators: list[Any] | None = None, - expected_key: str = "expected", tags: list[str] | None = None, timeout: float | None = None, judge: Any = None, @@ -78,7 +77,6 @@ def decorator(func: FuncT) -> FuncT: wrapper = make_eval_wrapper( func, evaluators or [], - expected_key, judge=resolved_judge, ) self.test(tags=tags, timeout=timeout, is_eval=True)(wrapper) diff --git a/protest/evals/types.py b/protest/evals/types.py index 59d2721..08543c6 100644 --- a/protest/evals/types.py +++ b/protest/evals/types.py @@ -19,8 +19,8 @@ class TaskResult(Generic[T]): Usage:: - @session.eval(evaluators=[...]) - async def my_eval(case) -> TaskResult[str]: + @suite.eval(evaluators=[...]) + async def my_eval(case: EvalCase) -> TaskResult[str]: result = await agent.run(case.inputs) usage = result.usage() return TaskResult( diff --git a/protest/evals/wrapper.py b/protest/evals/wrapper.py index bc2569b..e9161f1 100644 --- a/protest/evals/wrapper.py +++ b/protest/evals/wrapper.py @@ -14,6 +14,7 @@ from protest.entities.events import EvalPayload, EvalScoreEntry from protest.evals.evaluator import ( + EvalCase, EvalContext, ShortCircuit, extract_scores_from_result, @@ -26,14 +27,13 @@ def make_eval_wrapper( func: Any, evaluators: list[Any], - expected_key: str, judge: Any = None, ) -> Any: """Wrap a function to run evaluators on its return value.""" @functools.wraps(func) async def eval_wrapper(**kwargs: Any) -> EvalPayload: - expected = _extract_expected(kwargs, expected_key) + expected = _extract_expected(kwargs) case_name = _extract_case_name(kwargs, func.__name__) inputs = _extract_inputs(kwargs) metadata = _extract_metadata(kwargs) @@ -102,65 +102,51 @@ async def eval_wrapper(**kwargs: Any) -> EvalPayload: # 
--------------------------------------------------------------------------- -# Extract helpers — pull data from case_kwargs (dict or dataclass) +# Extract helpers — pull EvalCase from kwargs # --------------------------------------------------------------------------- -def _get(obj: Any, key: str, default: Any = None) -> Any: - """Get a value from a dict or dataclass by key/attr name.""" - if isinstance(obj, dict): - return obj.get(key, default) - return getattr(obj, key, default) - - -def _is_case_data(v: Any) -> bool: - """Check if a value looks like case data (dict or has 'expected'/'q'/'inputs').""" - if isinstance(v, dict): - return True - return hasattr(v, "expected") or hasattr(v, "q") or hasattr(v, "inputs") - - -def _extract_expected(kwargs: dict[str, Any], key: str) -> Any: +def _find_case(kwargs: dict[str, Any]) -> EvalCase | None: + """Find the EvalCase instance in kwargs.""" for v in kwargs.values(): - if _is_case_data(v): - val = _get(v, key) - if val is not None: - return val + if isinstance(v, EvalCase): + return v return None +def _extract_expected(kwargs: dict[str, Any]) -> Any: + case = _find_case(kwargs) + if case is None: + return None + return case.expected + + def _extract_case_name(kwargs: dict[str, Any], fallback: str) -> str: - for v in kwargs.values(): - if _is_case_data(v): - name = _get(v, "name") - if name: - return str(name) - return fallback + case = _find_case(kwargs) + if case is None or not case.name: + return fallback + return case.name def _extract_inputs(kwargs: dict[str, Any]) -> Any: - for v in kwargs.values(): - if _is_case_data(v): - return _get(v, "inputs") or _get(v, "q") or _get(v, "input") - return None + case = _find_case(kwargs) + if case is None: + return None + return case.inputs def _extract_metadata(kwargs: dict[str, Any]) -> Any: - for v in kwargs.values(): - if _is_case_data(v): - val = _get(v, "metadata") - if val is not None: - return val - return None + case = _find_case(kwargs) + if case is None: + return 
None + return case.metadata or None def _extract_per_case_evaluators(kwargs: dict[str, Any]) -> list[Any]: - for v in kwargs.values(): - if _is_case_data(v): - evs = _get(v, "evaluators") - if evs: - return list(evs) - return [] + case = _find_case(kwargs) + if case is None or not case.evaluators: + return [] + return list(case.evaluators) # --------------------------------------------------------------------------- diff --git a/tests/evals/test_e2e.py b/tests/evals/test_e2e.py index 2ecc09b..fc35686 100644 --- a/tests/evals/test_e2e.py +++ b/tests/evals/test_e2e.py @@ -25,6 +25,7 @@ from protest.core.suite import ProTestSuite from protest.entities import SuiteKind from protest.evals import ( + EvalCase, EvalContext, Metric, ModelInfo, @@ -91,10 +92,10 @@ async def async_echo_task(text: str) -> str: basic_cases = ForEach( [ - {"inputs": "hello world", "expected": "hello", "name": "case_pass"}, - {"inputs": "xyz", "expected": "notfound", "name": "case_fail"}, + EvalCase(inputs="hello world", expected="hello", name="case_pass"), + EvalCase(inputs="xyz", expected="notfound", name="case_fail"), ], - ids=lambda c: c["name"], + ids=lambda c: c.name, ) @@ -113,8 +114,8 @@ def test_add_eval_creates_eval_kind(self) -> None: session.add_suite(eval_echo_suite) @eval_echo_suite.eval(evaluators=[fake_accuracy]) - def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: - return echo_task(case["inputs"]) + def eval_echo(case: Annotated[EvalCase, From(basic_cases)]) -> str: + return echo_task(case.inputs) # The session should have a suite with kind=eval assert len(session._suites) > 0 @@ -137,8 +138,8 @@ def test_eval_with_bool_verdict(self) -> None: session.add_suite(eval_echo_suite) @eval_echo_suite.eval(evaluators=[fake_accuracy]) - def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: - return echo_task(case["inputs"]) + def eval_echo(case: Annotated[EvalCase, From(basic_cases)]) -> str: + return echo_task(case.inputs) runner = TestRunner(session) result = 
runner.run() @@ -153,8 +154,8 @@ def test_async_task_works(self) -> None: session.add_suite(eval_echo_suite) @eval_echo_suite.eval(evaluators=[fake_accuracy]) - async def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: - return await async_echo_task(case["inputs"]) + async def eval_echo(case: Annotated[EvalCase, From(basic_cases)]) -> str: + return await async_echo_task(case.inputs) runner = TestRunner(session) runner.run() @@ -163,9 +164,9 @@ def test_async_evaluator_does_not_crash(self) -> None: """Regression: async evaluator called via evaluate_sync raised 'event loop already running'.""" single_case = ForEach( [ - {"inputs": "hello world", "expected": "hello", "name": "c1"}, + EvalCase(inputs="hello world", expected="hello", name="c1"), ], - ids=lambda c: c["name"], + ids=lambda c: c.name, ) session = EvalSession() @@ -174,8 +175,8 @@ def test_async_evaluator_does_not_crash(self) -> None: session.add_suite(eval_echo_suite) @eval_echo_suite.eval(evaluators=[async_fake_accuracy]) - def eval_echo(case: Annotated[dict, From(single_case)]) -> str: - return echo_task(case["inputs"]) + def eval_echo(case: Annotated[EvalCase, From(single_case)]) -> str: + return echo_task(case.inputs) runner = TestRunner(session) result = runner.run() @@ -201,8 +202,8 @@ def test_eval_suite_has_kind_eval(self) -> None: session.add_suite(eval_echo_suite) @eval_echo_suite.eval(evaluators=[fake_accuracy]) - def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: - return echo_task(case["inputs"]) + def eval_echo(case: Annotated[EvalCase, From(basic_cases)]) -> str: + return echo_task(case.inputs) assert any(s.kind == "eval" for s in session._suites) @@ -249,8 +250,8 @@ def test_a() -> None: session.add_suite(eval_echo_suite) @eval_echo_suite.eval(evaluators=[fake_accuracy]) - def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: - return echo_task(case["inputs"]) + def eval_echo(case: Annotated[EvalCase, From(basic_cases)]) -> str: + return 
echo_task(case.inputs) ctx = PluginContext(args={"kind_filter": "test"}) run_session(session, ctx=ctx) @@ -274,8 +275,8 @@ def test_a() -> None: session.add_suite(eval_echo_suite) @eval_echo_suite.eval(evaluators=[fake_accuracy]) - def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: - return echo_task(case["inputs"]) + def eval_echo(case: Annotated[EvalCase, From(basic_cases)]) -> str: + return echo_task(case.inputs) ctx = PluginContext(args={"kind_filter": "eval"}) run_session(session, ctx=ctx) @@ -312,8 +313,8 @@ def on_eval_suite_end(self, report: Any) -> None: session.add_suite(eval_echo_suite) @eval_echo_suite.eval(evaluators=[fake_accuracy]) - def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: - return echo_task(case["inputs"]) + def eval_echo(case: Annotated[EvalCase, From(basic_cases)]) -> str: + return echo_task(case.inputs) runner = TestRunner(session) runner.run() @@ -340,8 +341,8 @@ def on_eval_suite_end(self, report: Any) -> None: session.add_suite(eval_echo_suite) @eval_echo_suite.eval(evaluators=[fake_accuracy]) - def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: - return echo_task(case["inputs"]) + def eval_echo(case: Annotated[EvalCase, From(basic_cases)]) -> str: + return echo_task(case.inputs) runner = TestRunner(session) runner.run() @@ -367,8 +368,8 @@ def on_test_fail(self, result: Any) -> None: session.add_suite(eval_echo_suite) @eval_echo_suite.eval(evaluators=[fake_accuracy]) - def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: - return echo_task(case["inputs"]) + def eval_echo(case: Annotated[EvalCase, From(basic_cases)]) -> str: + return echo_task(case.inputs) run_session(session) @@ -403,8 +404,8 @@ def on_test_fail(self, result: Any) -> None: session.add_suite(eval_echo_suite) @eval_echo_suite.eval(evaluators=[fake_accuracy]) - def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: - return echo_task(case["inputs"]) + def eval_echo(case: Annotated[EvalCase, From(basic_cases)]) 
-> str: + return echo_task(case.inputs) runner = TestRunner(session) runner.run() @@ -438,8 +439,8 @@ def on_test_teardown_start(self, info: Any) -> None: session.add_suite(eval_echo_suite) @eval_echo_suite.eval(evaluators=[fake_accuracy]) - def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: - return echo_task(case["inputs"]) + def eval_echo(case: Annotated[EvalCase, From(basic_cases)]) -> str: + return echo_task(case.inputs) runner = TestRunner(session) runner.run() @@ -466,9 +467,9 @@ def crashing_evaluator(ctx: EvalContext) -> bool: single_case = ForEach( [ - {"inputs": "hello", "expected": "hello", "name": "c1"}, + EvalCase(inputs="hello", expected="hello", name="c1"), ], - ids=lambda c: c["name"], + ids=lambda c: c.name, ) session = EvalSession() @@ -478,8 +479,8 @@ def crashing_evaluator(ctx: EvalContext) -> bool: session.add_suite(eval_echo_suite) @eval_echo_suite.eval(evaluators=[crashing_evaluator]) - def eval_echo(case: Annotated[dict, From(single_case)]) -> str: - return echo_task(case["inputs"]) + def eval_echo(case: Annotated[EvalCase, From(single_case)]) -> str: + return echo_task(case.inputs) runner = TestRunner(session) runner.run() @@ -527,8 +528,8 @@ def _run_eval(self, tmp_path: Path) -> None: session.add_suite(eval_echo_suite) @eval_echo_suite.eval(evaluators=[fake_accuracy]) - def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: - return echo_task(case["inputs"]) + def eval_echo(case: Annotated[EvalCase, From(basic_cases)]) -> str: + return echo_task(case.inputs) run_session(session) @@ -593,8 +594,8 @@ def test_history_metadata_included(self, tmp_path: Path) -> None: session.add_suite(eval_echo_suite) @eval_echo_suite.eval(evaluators=[fake_accuracy]) - def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: - return echo_task(case["inputs"]) + def eval_echo(case: Annotated[EvalCase, From(basic_cases)]) -> str: + return echo_task(case.inputs) run_session(session) @@ -660,8 +661,8 @@ def 
test_case_hash_stored_in_history(self, tmp_path: Path) -> None: session.add_suite(eval_echo_suite) @eval_echo_suite.eval(evaluators=[fake_accuracy]) - def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: - return echo_task(case["inputs"]) + def eval_echo(case: Annotated[EvalCase, From(basic_cases)]) -> str: + return echo_task(case.inputs) run_session(session) @@ -790,9 +791,9 @@ def on_test_fail(self, result: Any) -> None: single_case = ForEach( [ - {"inputs": "hello world", "expected": "hello", "name": "c1"}, + EvalCase(inputs="hello world", expected="hello", name="c1"), ], - ids=lambda c: c["name"], + ids=lambda c: c.name, ) session = EvalSession() @@ -802,8 +803,8 @@ def on_test_fail(self, result: Any) -> None: session.add_suite(eval_echo_suite) @eval_echo_suite.eval(evaluators=[not_empty]) - def eval_echo(case: Annotated[dict, From(single_case)]) -> str: - return echo_task(case["inputs"]) + def eval_echo(case: Annotated[EvalCase, From(single_case)]) -> str: + return echo_task(case.inputs) runner = TestRunner(session) result = runner.run() @@ -827,9 +828,9 @@ def on_test_fail(self, result: Any) -> None: single_case = ForEach( [ - {"inputs": "foo", "expected": "bar baz", "name": "c1"}, + EvalCase(inputs="foo", expected="bar baz", name="c1"), ], - ids=lambda c: c["name"], + ids=lambda c: c.name, ) session = EvalSession() @@ -839,8 +840,8 @@ def on_test_fail(self, result: Any) -> None: session.add_suite(eval_echo_suite) @eval_echo_suite.eval(evaluators=[word_overlap]) - def eval_echo(case: Annotated[dict, From(single_case)]) -> str: - return echo_task(case["inputs"]) + def eval_echo(case: Annotated[EvalCase, From(single_case)]) -> str: + return echo_task(case.inputs) runner = TestRunner(session) result = runner.run() @@ -863,8 +864,8 @@ def bad_evaluator(ctx: EvalContext) -> float: return 0.5 single_case = ForEach( - [{"inputs": "hello", "expected": "hello", "name": "c1"}], - ids=lambda c: c["name"], + [EvalCase(inputs="hello", expected="hello", 
name="c1")], + ids=lambda c: c.name, ) session = EvalSession() @@ -874,8 +875,8 @@ def bad_evaluator(ctx: EvalContext) -> float: session.add_suite(eval_echo_suite) @eval_echo_suite.eval(evaluators=[bad_evaluator]) - def eval_echo(case: Annotated[dict, From(single_case)]) -> str: - return echo_task(case["inputs"]) + def eval_echo(case: Annotated[EvalCase, From(single_case)]) -> str: + return echo_task(case.inputs) runner = TestRunner(session) runner.run() @@ -906,8 +907,8 @@ def expensive(ctx: EvalContext) -> bool: session.add_suite(eval_echo_suite) @eval_echo_suite.eval(evaluators=[ShortCircuit([cheap, expensive])]) - def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: - return echo_task(case["inputs"]) + def eval_echo(case: Annotated[EvalCase, From(basic_cases)]) -> str: + return echo_task(case.inputs) runner = TestRunner(session) runner.run() @@ -931,7 +932,7 @@ def check_b(ctx: EvalContext) -> bool: return True single = ForEach( - [{"inputs": "x", "expected": "x", "name": "c1"}], ids=lambda c: c["name"] + [EvalCase(inputs="x", expected="x", name="c1")], ids=lambda c: c.name ) session = EvalSession() @@ -939,8 +940,8 @@ def check_b(ctx: EvalContext) -> bool: session.add_suite(eval_echo_suite) @eval_echo_suite.eval(evaluators=[ShortCircuit([check_a, check_b])]) - def eval_echo(case: Annotated[dict, From(single)]) -> str: - return echo_task(case["inputs"]) + def eval_echo(case: Annotated[EvalCase, From(single)]) -> str: + return echo_task(case.inputs) runner = TestRunner(session) result = runner.run() @@ -967,8 +968,8 @@ def _run_eval(self, tmp_path: Path) -> Path: session.add_suite(eval_echo_suite) @eval_echo_suite.eval(evaluators=[fake_accuracy]) - def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: - return echo_task(case["inputs"]) + def eval_echo(case: Annotated[EvalCase, From(basic_cases)]) -> str: + return echo_task(case.inputs) runner = TestRunner(session) runner.run() @@ -1019,16 +1020,16 @@ class TestMultiDatasetHistory: def 
_run_multi(self, tmp_path: Path) -> dict[str, Any]: pipeline_cases = ForEach( [ - {"inputs": "hello", "expected": "hello", "name": "c1"}, + EvalCase(inputs="hello", expected="hello", name="c1"), ], - ids=lambda c: c["name"], + ids=lambda c: c.name, ) ingest_cases = ForEach( [ - {"inputs": "world", "expected": "world", "name": "c2"}, + EvalCase(inputs="world", expected="world", name="c2"), ], - ids=lambda c: c["name"], + ids=lambda c: c.name, ) session = EvalSession(history_dir=tmp_path) @@ -1037,15 +1038,15 @@ def _run_multi(self, tmp_path: Path) -> dict[str, Any]: session.add_suite(pipeline_suite) @pipeline_suite.eval(evaluators=[fake_accuracy]) - def pipeline(case: Annotated[dict, From(pipeline_cases)]) -> str: - return echo_task(case["inputs"]) + def pipeline(case: Annotated[EvalCase, From(pipeline_cases)]) -> str: + return echo_task(case.inputs) ingest_suite = EvalSuite("ingest") session.add_suite(ingest_suite) @ingest_suite.eval(evaluators=[fake_accuracy]) - def ingest(case: Annotated[dict, From(ingest_cases)]) -> str: - return echo_task(case["inputs"]) + def ingest(case: Annotated[EvalCase, From(ingest_cases)]) -> str: + return echo_task(case.inputs) run_session(session) @@ -1080,8 +1081,8 @@ def test_task_without_fixtures_still_works(self) -> None: session.add_suite(eval_echo_suite) @eval_echo_suite.eval(evaluators=[fake_accuracy]) - def eval_echo(case: Annotated[dict, From(basic_cases)]) -> str: - return echo_task(case["inputs"]) + def eval_echo(case: Annotated[EvalCase, From(basic_cases)]) -> str: + return echo_task(case.inputs) runner = TestRunner(session) result = runner.run() @@ -1096,9 +1097,9 @@ def prefix_service() -> str: single_case = ForEach( [ - {"inputs": "hello", "expected": "PREFIX:hello", "name": "c1"}, + EvalCase(inputs="hello", expected="PREFIX:hello", name="c1"), ], - ids=lambda c: c["name"], + ids=lambda c: c.name, ) session = EvalSession() @@ -1109,10 +1110,10 @@ def prefix_service() -> str: 
@eval_prefixed_suite.eval(evaluators=[fake_accuracy]) async def eval_prefixed( - case: Annotated[dict, From(single_case)], + case: Annotated[EvalCase, From(single_case)], svc: Annotated[str, Use(prefix_service)], ) -> str: - return f"{svc}:{case['inputs']}" + return f"{svc}:{case.inputs}" runner = TestRunner(session) result = runner.run() @@ -1132,11 +1133,11 @@ def expensive_resource() -> str: multi_cases = ForEach( [ - {"inputs": "a", "expected": "resource:a", "name": "c1"}, - {"inputs": "b", "expected": "resource:b", "name": "c2"}, - {"inputs": "c", "expected": "resource:c", "name": "c3"}, + EvalCase(inputs="a", expected="resource:a", name="c1"), + EvalCase(inputs="b", expected="resource:b", name="c2"), + EvalCase(inputs="c", expected="resource:c", name="c3"), ], - ids=lambda c: c["name"], + ids=lambda c: c.name, ) session = EvalSession() @@ -1147,10 +1148,10 @@ def expensive_resource() -> str: @eval_resource_suite.eval(evaluators=[fake_accuracy]) async def eval_resource( - case: Annotated[dict, From(multi_cases)], + case: Annotated[EvalCase, From(multi_cases)], res: Annotated[str, Use(expensive_resource)], ) -> str: - return f"{res}:{case['inputs']}" + return f"{res}:{case.inputs}" runner = TestRunner(session) runner.run() From d3f542cae0ef78e29e5a5b05d175339bf8609b09 Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Sat, 4 Apr 2026 07:48:20 +0200 Subject: [PATCH 22/60] refactor(evals): enhance docstrings for EvalSuite and EvalSession with detailed functionality descriptions --- protest/evals/session.py | 4 ++-- protest/evals/suite.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/protest/evals/session.py b/protest/evals/session.py index c85527a..419e9f6 100644 --- a/protest/evals/session.py +++ b/protest/evals/session.py @@ -1,4 +1,4 @@ -"""EvalSession — session dédiée aux evals.""" +"""EvalSession — eval-dedicated session with history and default propagation.""" from __future__ import 
annotations @@ -20,7 +20,7 @@ class EvalSession(ProTestSession): - """Session dédiée aux evals. + """Eval-dedicated session with history enabled by default. Usage:: diff --git a/protest/evals/suite.py b/protest/evals/suite.py index f0aba7e..905010c 100644 --- a/protest/evals/suite.py +++ b/protest/evals/suite.py @@ -1,4 +1,4 @@ -"""EvalSuite — suite dédiée aux evals.""" +"""EvalSuite — eval-dedicated suite with judge and model support.""" from __future__ import annotations @@ -17,7 +17,7 @@ class EvalSuite(ProTestSuite): - """Suite dédiée aux evals. + """Eval-dedicated suite that forces kind=EVAL and carries judge/model config. Usage:: From 6b3c203a9a823a3a82dcd13b030d4e14fc38279c Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Sat, 4 Apr 2026 15:03:35 +0200 Subject: [PATCH 23/60] feat(reporting): add eval suite and case payloads to web reporting - Added support for emitting an `EVAL_SUITE_END` event with detailed suite-level metrics and score statistics. - Extended `SUITE_END` payloads to include evaluation-related details when processing eval-specific results. 
--- protest/reporting/web.py | 53 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/protest/reporting/web.py b/protest/reporting/web.py index 517de24..5eb7119 100644 --- a/protest/reporting/web.py +++ b/protest/reporting/web.py @@ -28,6 +28,7 @@ TestStartInfo, TestTeardownInfo, ) + from protest.evals.types import EvalSuiteReport try: from websockets.asyncio.server import ( @@ -245,6 +246,33 @@ def on_fixture_teardown_done(self, info: FixtureInfo) -> None: {"name": info.name, "scope": info.scope, "duration": info.duration}, ) + def on_eval_suite_end(self, report: EvalSuiteReport) -> None: + self._send( + "EVAL_SUITE_END", + { + "suiteName": report.suite_name, + "totalCount": report.total_count, + "passedCount": report.passed_count, + "failedCount": report.failed_count, + "passRate": report.pass_rate, + "duration": report.duration, + "scoreStats": [ + { + "name": s.name, + "mean": s.mean, + "median": s.median, + "p5": s.p5, + "p95": s.p95, + } + for s in report.all_score_stats() + ], + "taskTokens": report.total_task_tokens, + "taskCost": report.total_task_cost, + "judgeTokens": report.total_judge_tokens, + "judgeCost": report.total_judge_cost, + }, + ) + def on_suite_end(self, result: SuiteResult) -> None: self._send( "SUITE_END", @@ -276,4 +304,29 @@ def _result_payload( if include_error and result.error: payload["message"] = str(result.error) payload["traceback"] = _format_traceback(result.error) + if result.is_eval and result.eval_payload: + ep = result.eval_payload + payload["evalPayload"] = { + "caseName": ep.case_name, + "passed": ep.passed, + "inputs": ep.inputs, + "output": ep.output, + "expected": ep.expected_output, + "scores": { + name: { + "value": entry.value, + "passed": entry.passed, + "skipped": entry.skipped, + } + for name, entry in ep.scores.items() + }, + "taskDuration": ep.task_duration, + "taskInputTokens": ep.task_input_tokens, + "taskOutputTokens": ep.task_output_tokens, + "taskCost": ep.task_cost, + 
"judgeCallCount": ep.judge_call_count, + "judgeInputTokens": ep.judge_input_tokens, + "judgeOutputTokens": ep.judge_output_tokens, + "judgeCost": ep.judge_cost, + } return payload From 924615f089e9cc5f2d85c87e5b52d0a5f97b0bcc Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Tue, 14 Apr 2026 06:34:00 +0200 Subject: [PATCH 24/60] refactor(evals): remove EvalSession, merge history plugins, always-on architecture MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Delete EvalSession — ProTestSession is the only session - Merge HistoryPlugin + EvalHistoryPlugin into single always-on plugin - EvalResultsWriter now always-on (no-op without evals) - Model/judge live entirely on EvalSuite, no session propagation - history=True by default on ProTestSession - Remove apply_defaults, _wire_eval_support, add_suite override --- docs/evals.md | 53 +++----- examples/yorkshire/evals/session.py | 17 +-- protest/core/session.py | 12 +- protest/evals/evaluator.py | 2 +- protest/evals/history.py | 165 ------------------------ protest/evals/results_writer.py | 4 +- protest/evals/session.py | 96 -------------- protest/evals/types.py | 2 +- protest/history/plugin.py | 193 +++++++++++++++++++++++----- tests/evals/test_e2e.py | 67 +++++----- tests/evals/test_judge.py | 45 ++++--- 11 files changed, 249 insertions(+), 407 deletions(-) delete mode 100644 protest/evals/history.py delete mode 100644 protest/evals/session.py diff --git a/docs/evals.md b/docs/evals.md index cca70cc..387267f 100644 --- a/docs/evals.md +++ b/docs/evals.md @@ -14,10 +14,9 @@ ProTest evals use the same infrastructure as tests: fixtures, DI, parallelism, t # evals/session.py from typing import Annotated -from protest import ForEach, From +from protest import ForEach, From, ProTestSession from protest.evals import EvalCase, ModelInfo, evaluator from protest.evals.evaluators import contains_keywords -from protest.evals.session import 
EvalSession from protest.evals.suite import EvalSuite cases = ForEach([ @@ -25,9 +24,9 @@ cases = ForEach([ EvalCase(inputs="What is 2+2?", expected="4", name="math"), ]) -session = EvalSession(model=ModelInfo(name="gpt-4o-mini")) +session = ProTestSession() -chatbot_suite = EvalSuite("chatbot") +chatbot_suite = EvalSuite("chatbot", model=ModelInfo(name="gpt-4o-mini")) session.add_suite(chatbot_suite) @chatbot_suite.eval(evaluators=[contains_keywords(keywords=["Marie"])]) @@ -51,42 +50,22 @@ protest eval evals.session:session The rest of the pipeline — fixtures, DI, parallelism, reporters — works identically to tests. -## EvalSession - -`EvalSession` is a session configured for evals. History is enabled by default. Model and judge set on the session are propagated as defaults to `EvalSuite` instances added via `session.add_suite()`. - -```python -from protest.evals import ModelInfo -from protest.evals.session import EvalSession - -session = EvalSession( - model=ModelInfo(name="gpt-4o-mini"), # propagated to suites, tracked in history - concurrency=4, # parallel eval cases - metadata={"version": "1.0"}, # stored in history -) -``` - ## EvalSuite -`EvalSuite` groups eval cases. It's the eval equivalent of `ProTestSuite` — it forces `kind=EVAL` and carries model/judge configuration. +`EvalSuite` groups eval cases. It's the eval equivalent of `ProTestSuite` — it forces `kind=EVAL` and carries model/judge configuration. Model and judge are suite-level config: each suite declares which model produced its results and which judge scores them. 
```python from protest.evals.suite import EvalSuite +from protest.evals import ModelInfo -chatbot_suite = EvalSuite("chatbot") -session.add_suite(chatbot_suite) # model/judge propagated from session +chatbot_suite = EvalSuite("chatbot", model=ModelInfo(name="gpt-4o-mini")) +session.add_suite(chatbot_suite) @chatbot_suite.eval(evaluators=[my_scorer]) async def chatbot(case: Annotated[EvalCase, From(cases)]) -> str: return await my_agent(case.inputs) ``` -Per-suite model override: - -```python -chatbot_suite = EvalSuite("chatbot", model=ModelInfo(name="mistral-7b")) -``` - ## EvalCase Typed dataclass for eval case data. Provides IDE autocompletion instead of untyped dicts. @@ -189,7 +168,7 @@ The threshold (`min_recall`) is a parameter of the evaluator, not a framework co ### Async (LLM Judge) -Use `ctx.judge()` for structured LLM evaluation (requires `judge=` on `EvalSession`): +Use `ctx.judge()` for structured LLM evaluation (requires `judge=` on `EvalSuite`): ```python @dataclass @@ -305,7 +284,7 @@ async def pipeline_eval( `ModelInfo` is a **label for history tracking** — it does not configure or route to any model. It records which model produced the results so you can compare runs. ```python -session = EvalSession(model=ModelInfo(name="qwen-2.5")) +suite = EvalSuite("pipeline", model=ModelInfo(name="qwen-2.5")) ``` ## Judge @@ -358,7 +337,8 @@ return JudgeResponse(output=result.output) # tokens/cost = None, that's fine ### Configuring the Judge ```python -session = EvalSession( +suite = EvalSuite( + "pipeline", model=ModelInfo(name="qwen-2.5"), judge=PydanticAIJudge(model="gpt-4o-mini", temperature=0), ) @@ -394,7 +374,7 @@ async def simple_judge(ctx: EvalContext) -> bool: ### No Judge Configured -If an evaluator calls `ctx.judge()` and no judge was passed to `EvalSession`, a `RuntimeError` is raised. This is treated as an **infrastructure error** (not a test failure), same as a fixture crash. 
+If an evaluator calls `ctx.judge()` and no judge was passed to `EvalSuite`, a `RuntimeError` is raised. This is treated as an **infrastructure error** (not a test failure), same as a fixture crash. ### Usage Tracking @@ -457,13 +437,10 @@ If two evaluators return dataclasses with the same field name (e.g. both have `a Track which model produced each eval suite's results. Each `EvalSuite` can have its own model: ```python -pipeline_model = ModelInfo(name="qwen-2.5") -chat_model = ModelInfo(name="mistral-7b") +session = ProTestSession() -session = EvalSession(model=pipeline_model) # default model - -pipeline_suite = EvalSuite("pipeline") # inherits pipeline_model from session -chatbot_suite = EvalSuite("chatbot", model=chat_model) # override +pipeline_suite = EvalSuite("pipeline", model=ModelInfo(name="qwen-2.5")) +chatbot_suite = EvalSuite("chatbot", model=ModelInfo(name="mistral-7b")) session.add_suite(pipeline_suite) session.add_suite(chatbot_suite) diff --git a/examples/yorkshire/evals/session.py b/examples/yorkshire/evals/session.py index f1800d8..f03d733 100644 --- a/examples/yorkshire/evals/session.py +++ b/examples/yorkshire/evals/session.py @@ -15,20 +15,21 @@ suite_evaluators, yorkshire_cases, ) -from protest import From -from protest.evals import ModelInfo -from protest.evals.session import EvalSession +from protest import From, ProTestSession +from protest.evals import EvalCase, ModelInfo from protest.evals.suite import EvalSuite -session = EvalSession( - model=ModelInfo(name="yorkshire-chatbot-v1", provider="local"), +session = ProTestSession( metadata={"version": "1.0", "type": "keyword-matching"}, ) -yorkshire_suite = EvalSuite("yorkshire_eval") +yorkshire_suite = EvalSuite( + "yorkshire_eval", + model=ModelInfo(name="yorkshire-chatbot-v1", provider="local"), +) session.add_suite(yorkshire_suite) @yorkshire_suite.eval(evaluators=suite_evaluators) -def yorkshire_eval(case: Annotated[dict, From(yorkshire_cases)]) -> str: - return 
yorkshire_chatbot(case["inputs"]) +def yorkshire_eval(case: Annotated[EvalCase, From(yorkshire_cases)]) -> str: + return yorkshire_chatbot(case.inputs) diff --git a/protest/core/session.py b/protest/core/session.py index daafa74..4b3d008 100644 --- a/protest/core/session.py +++ b/protest/core/session.py @@ -27,6 +27,7 @@ normalize_skip, normalize_xfail, ) +from protest.evals.results_writer import EvalResultsWriter from protest.events.bus import EventBus from protest.events.types import Event from protest.exceptions import InvalidMaxConcurrencyError @@ -34,6 +35,7 @@ from protest.filters.keyword import KeywordFilterPlugin from protest.filters.kind import KindFilterPlugin from protest.filters.suite import SuiteFilterPlugin +from protest.history.plugin import HistoryPlugin from protest.reporting.ascii import AsciiReporter from protest.reporting.ctrf import CTRFReporter from protest.reporting.log_file import LogFilePlugin @@ -59,7 +61,7 @@ class ProTestSession: def __init__( self, concurrency: int = 1, - history: bool = False, + history: bool = True, history_dir: Path | None = None, metadata: dict[str, Any] | None = None, ) -> None: @@ -268,6 +270,8 @@ def default_plugin_classes() -> list[type[PluginBase]]: SuiteFilterPlugin, KeywordFilterPlugin, KindFilterPlugin, + HistoryPlugin, + EvalResultsWriter, RichReporter, AsciiReporter, CTRFReporter, @@ -278,12 +282,6 @@ def register_default_plugins(self) -> None: """Register all standard ProTest plugins for CLI discovery.""" for plugin_class in self.default_plugin_classes(): self.use(plugin_class) - if self._history: - from protest.history.plugin import ( # noqa: PLC0415 — conditional - HistoryPlugin, - ) - - self.register_plugin(HistoryPlugin(history_dir=self._history_dir)) @property def plugin_classes(self) -> list[type[PluginBase]]: diff --git a/protest/evals/evaluator.py b/protest/evals/evaluator.py index 07dc2f2..6baab51 100644 --- a/protest/evals/evaluator.py +++ b/protest/evals/evaluator.py @@ -93,7 +93,7 @@ async def 
judge(self, prompt: str, output_type: type[T]) -> T: if self._judge is None: raise RuntimeError( f"Evaluator for case '{self.name}' called ctx.judge() but no " - "judge is configured. Pass judge= to EvalSession()." + "judge is configured. Pass judge= to EvalSuite()." ) self._judge_call_count += 1 response = await self._judge.judge(prompt, output_type) diff --git a/protest/evals/history.py b/protest/evals/history.py deleted file mode 100644 index 010ddb8..0000000 --- a/protest/evals/history.py +++ /dev/null @@ -1,165 +0,0 @@ -"""EvalHistoryPlugin — persists eval run results as JSONL with model/scores.""" - -from __future__ import annotations - -import uuid -from datetime import datetime, timezone -from typing import TYPE_CHECKING, Any - -from protest.entities import SuiteKind -from protest.history.collector import collect_env_info, collect_git_info -from protest.history.storage import ( - DEFAULT_HISTORY_DIR, - HISTORY_FILE, - append_entry, - load_history, - load_previous_run, -) -from protest.plugin import PluginBase - -if TYPE_CHECKING: - from pathlib import Path - - from protest.core.session import ProTestSession - from protest.evals.types import EvalCaseResult, EvalSuiteReport, ModelInfo - from protest.plugin import PluginContext - - -class EvalHistoryPlugin(PluginBase): - """Persists eval results to JSONL with model/judge/scores metadata. - - Listens to EVAL_SUITE_END events (emitted by the core runner). 
- """ - - name = "eval-history" - description = "Eval history tracking" - - def __init__( - self, - *, - history_dir: Path | None = None, - model: ModelInfo | None = None, - judge: dict[str, Any] | None = None, - metadata: dict[str, Any] | None = None, - ) -> None: - self._history_dir = history_dir or DEFAULT_HISTORY_DIR - self._history_file = self._history_dir / HISTORY_FILE - self._model = model - self._judge = judge - self._metadata = dict(metadata) if metadata else {} - self._reports: dict[str, EvalSuiteReport] = {} - - _suite_metadata: dict[str, dict[str, Any]] - - @classmethod - def activate(cls, ctx: PluginContext) -> EvalHistoryPlugin | None: - return None # Wired explicitly by session - - def setup(self, session: ProTestSession) -> None: - """Collect per-suite metadata from session.""" - self._suite_metadata = {} - for suite in session.suites: - if suite.kind == SuiteKind.EVAL: - self._suite_metadata[suite.name] = suite.suite_metadata - - def on_eval_suite_end(self, report: EvalSuiteReport) -> None: - """Collect suite reports as they arrive.""" - self._reports[report.suite_name] = report - - def on_session_end(self, _result: Any) -> None: - """Write all collected reports to history.""" - if not self._reports: - return - entry = _build_entry( - self._reports, - self._model, - self._judge, - self._metadata, - self._suite_metadata, - ) - append_entry(self._history_file, entry) - - def load_entries(self, n: int | None = None) -> list[dict[str, Any]]: - """Load entries from history file.""" - return load_history(history_dir=self._history_dir, n=n, evals_only=True) - - -def _build_entry( - reports: dict[str, EvalSuiteReport], - model: ModelInfo | None, - judge: dict[str, Any] | None, - metadata: dict[str, Any] | None = None, - all_suite_metadata: dict[str, dict[str, Any]] | None = None, -) -> dict[str, Any]: - """Build a complete history entry covering all suites in the session.""" - suites_data: dict[str, Any] = {} - all_score_stats: list[Any] = [] - - for 
suite_name, report in reports.items(): - sm = (all_suite_metadata or {}).get(suite_name, {}) - suite_model = sm.get("model") or (model.name if model else None) - suite_provider = sm.get("provider") or (model.provider if model else None) - suites_data[suite_name] = { - "kind": "eval", - "model": suite_model, - "provider": suite_provider, - "total_cases": report.total_count, - "passed": report.passed_count, - "failed": report.failed_count, - "errored": report.errored_count, - "pass_rate": round(report.pass_rate, 4), - "duration": round(report.duration, 2), - "cases": {c.case_name: _serialize_case(c) for c in report.cases}, - } - all_score_stats.extend(report.all_score_stats()) - - scores_summary = { - s.name: { - "mean": round(s.mean, 4), - "median": round(s.median, 4), - "p5": round(s.p5, 4), - "p95": round(s.p95, 4), - "min": round(s.min, 4), - "max": round(s.max, 4), - "count": s.count, - } - for s in all_score_stats - } - - return { - "run_id": str(uuid.uuid4()), - "timestamp": datetime.now(tz=timezone.utc).isoformat(), - "git": collect_git_info(), - "environment": collect_env_info(), - "metadata": dict(metadata) if metadata else {}, - "evals": { - "model": model.name if model else None, - "provider": model.provider if model else None, - "judge": judge, - "scores_summary": scores_summary, - }, - "suites": suites_data, - } - - -def _serialize_case(case: EvalCaseResult) -> dict[str, Any]: - entry: dict[str, Any] = { - "passed": case.passed, - "is_error": case.is_error, - "duration": round(case.duration, 3), - "scores": {s.name: s.value for s in case.scores if s.is_metric}, - "case_hash": case.case_hash, - "eval_hash": case.eval_hash, - } - labels = {s.name: s.value for s in case.scores if isinstance(s.value, str)} - if labels: - entry["labels"] = labels - assertions = {s.name: s.value for s in case.scores if isinstance(s.value, bool)} - if assertions: - entry["assertions"] = assertions - return entry - - -def load_previous_eval_run(history_dir: Any = None) -> 
dict[str, Any] | None: - """Load the most recent eval run from history.""" - return load_previous_run(history_dir=history_dir, evals_only=True) diff --git a/protest/evals/results_writer.py b/protest/evals/results_writer.py index e069bba..67ca569 100644 --- a/protest/evals/results_writer.py +++ b/protest/evals/results_writer.py @@ -34,8 +34,8 @@ def __init__(self, history_dir: Path | None = None) -> None: self._run_dirs: dict[str, Path] = {} @classmethod - def activate(cls, ctx: PluginContext) -> EvalResultsWriter | None: - return None # Wired explicitly by session + def activate(cls, ctx: PluginContext) -> EvalResultsWriter: + return cls(history_dir=ctx.get("history_dir")) def on_test_pass(self, result: TestResult) -> None: self._maybe_write(result, passed=True) diff --git a/protest/evals/session.py b/protest/evals/session.py deleted file mode 100644 index 419e9f6..0000000 --- a/protest/evals/session.py +++ /dev/null @@ -1,96 +0,0 @@ -"""EvalSession — eval-dedicated session with history and default propagation.""" - -from __future__ import annotations - -from typing import TYPE_CHECKING, Any - -from protest.core.session import ProTestSession -from protest.entities import SuiteKind -from protest.evals.history import EvalHistoryPlugin -from protest.evals.results_writer import EvalResultsWriter -from protest.evals.suite import EvalSuite -from protest.evals.types import JudgeInfo - -if TYPE_CHECKING: - from pathlib import Path - - from protest.core.suite import ProTestSuite - from protest.evals.types import Judge, ModelInfo - from protest.plugin import PluginContext - - -class EvalSession(ProTestSession): - """Eval-dedicated session with history enabled by default. 
- - Usage:: - - session = EvalSession(model=ModelInfo(name="qwen-2.5")) - - chatbot = EvalSuite("chatbot") - session.add_suite(chatbot) - - @chatbot.eval(evaluators=[contains_facts]) - async def chatbot(case: Annotated[EvalCase, From(cases)]) -> str: - return await ask(case.inputs) - """ - - def __init__( - self, - *, - model: ModelInfo | None = None, - judge: Judge | None = None, - concurrency: int = 1, - history: bool = True, - history_dir: Path | None = None, - metadata: dict[str, Any] | None = None, - ) -> None: - super().__init__( - concurrency=concurrency, - history=history, - history_dir=history_dir, - metadata=metadata, - ) - self._eval_model = model - self._eval_judge_instance: Judge | None = judge - self._eval_judge: JudgeInfo | None = ( - JudgeInfo(name=judge.name, provider=judge.provider) - if judge is not None - else None - ) - - def add_suite(self, suite: ProTestSuite) -> None: - """Add a suite, propagating session-level model/judge as defaults.""" - if isinstance(suite, EvalSuite): - if suite.judge is None and self._eval_judge_instance is not None: - suite._judge = self._eval_judge_instance - if self._eval_model and "model" not in suite.suite_metadata: - suite._metadata["model"] = self._eval_model.name - suite._metadata["provider"] = self._eval_model.provider - super().add_suite(suite) - - def activate_plugins(self, ctx: PluginContext) -> None: - """Activate plugins, then wire eval support if needed.""" - super().activate_plugins(ctx) - if any(s.kind == SuiteKind.EVAL for s in self._suites): - self._wire_eval_support() - - def _wire_eval_support(self) -> None: - """Wire eval history + results writer plugins.""" - judge_dict = None - if self._eval_judge: - judge_dict = { - "name": self._eval_judge.name, - "provider": self._eval_judge.provider, - "evaluators": list(self._eval_judge.evaluators), - } - - history = EvalHistoryPlugin( - history_dir=self._history_dir, - model=self._eval_model, - judge=judge_dict, - metadata=self._metadata, - ) - 
self.register_plugin(history) - - writer = EvalResultsWriter(history_dir=self._history_dir) - self.register_plugin(writer) diff --git a/protest/evals/types.py b/protest/evals/types.py index 08543c6..7a2c19a 100644 --- a/protest/evals/types.py +++ b/protest/evals/types.py @@ -82,7 +82,7 @@ async def judge(self, prompt: str, output_type: type[T]) -> JudgeResponse[T]: result = await agent.run(prompt) return JudgeResponse(output=result.output, input_tokens=100) - session = EvalSession(judge=MyJudge()) + suite = EvalSuite("chatbot", judge=MyJudge()) """ name: str diff --git a/protest/history/plugin.py b/protest/history/plugin.py index c8a0f79..930ca61 100644 --- a/protest/history/plugin.py +++ b/protest/history/plugin.py @@ -1,4 +1,4 @@ -"""HistoryPlugin — persists test run results as JSONL.""" +"""HistoryPlugin — persists test and eval run results as JSONL.""" from __future__ import annotations @@ -6,66 +6,124 @@ from datetime import datetime, timezone from typing import TYPE_CHECKING, Any +from protest.entities import SuiteKind +from protest.evals.suite import EvalSuite from protest.history.collector import collect_env_info, collect_git_info -from protest.history.storage import DEFAULT_HISTORY_DIR, HISTORY_FILE, append_entry +from protest.history.storage import ( + DEFAULT_HISTORY_DIR, + HISTORY_FILE, + append_entry, + load_previous_run, +) from protest.plugin import PluginBase if TYPE_CHECKING: from pathlib import Path from protest.core.session import ProTestSession - from protest.entities import SuiteKind - from protest.entities.events import SessionResult, TestResult + from protest.entities.events import TestResult + from protest.evals.types import EvalCaseResult, EvalSuiteReport from protest.plugin import PluginContext class HistoryPlugin(PluginBase): - """Persists test results to JSONL for run-over-run tracking.""" + """Persists test and eval results to JSONL for run-over-run tracking. + + Always-on plugin. 
When history is disabled on the session, all handlers + are no-ops. Handles both test results (on_test_pass/fail) and eval + results (on_eval_suite_end). + """ name = "history" - description = "Test history tracking" + description = "Run history tracking" def __init__(self, history_dir: Path | None = None) -> None: self._history_dir = history_dir or DEFAULT_HISTORY_DIR self._history_file = self._history_dir / HISTORY_FILE - self._suites: dict[str, dict[str, dict[str, Any]]] = {} + # Test data + self._test_suites: dict[str, dict[str, dict[str, Any]]] = {} self._suite_kinds: dict[str, SuiteKind] = {} self._default_suite_name: str = "tests" - self._history_enabled: bool = False + # Eval data + self._eval_reports: dict[str, EvalSuiteReport] = {} + self._eval_suite_metadata: dict[str, dict[str, Any]] = {} + self._eval_judge_info: dict[str, dict[str, Any]] = {} + # Session state + self._enabled: bool = False self._metadata: dict[str, Any] = {} @classmethod def activate(cls, ctx: PluginContext) -> HistoryPlugin | None: - return None # Wired explicitly by session + if ctx.get("no_history", False): + return None + return cls(history_dir=ctx.get("history_dir")) def setup(self, session: ProTestSession) -> None: - self._history_enabled = session.history + self._enabled = session.history self._metadata = dict(session.metadata) + if session.history_dir: + self._history_dir = session.history_dir + self._history_file = self._history_dir / HISTORY_FILE for suite in session.suites: self._suite_kinds[suite.name] = suite.kind - if not self._default_suite_name or self._default_suite_name == "tests": + if suite.kind == SuiteKind.EVAL: + self._eval_suite_metadata[suite.name] = suite.suite_metadata + if isinstance(suite, EvalSuite) and suite.judge is not None: + self._eval_judge_info[suite.name] = { + "name": suite.judge.name, + "provider": getattr(suite.judge, "provider", None), + } + elif not self._default_suite_name or self._default_suite_name == "tests": self._default_suite_name = 
suite.name + # -- Test event handlers -------------------------------------------------- + def on_test_pass(self, result: TestResult) -> None: - if result.is_eval: + if not self._enabled or result.is_eval: return - self._record(result, passed=True) + self._record_test(result, passed=True) def on_test_fail(self, result: TestResult) -> None: - if result.is_eval: + if not self._enabled or result.is_eval: return - self._record(result, passed=False) + self._record_test(result, passed=False) + + def _record_test(self, result: TestResult, *, passed: bool) -> None: + suite_name = ( + result.suite_path.root_name + if result.suite_path + else self._default_suite_name + ) + if suite_name not in self._test_suites: + self._test_suites[suite_name] = {} + self._test_suites[suite_name][result.name] = { + "passed": passed, + "duration": round(result.duration, 3), + } + + # -- Eval event handlers -------------------------------------------------- - def on_session_end(self, _result: SessionResult) -> None: - if not self._history_enabled or not self._suites: + def on_eval_suite_end(self, report: EvalSuiteReport) -> None: + if not self._enabled: + return + self._eval_reports[report.suite_name] = report + + # -- Session end: write combined entry ------------------------------------ + + def on_session_end(self, result: Any) -> None: + if not self._enabled: + return + if not self._test_suites and not self._eval_reports: return suites_data: dict[str, Any] = {} - for suite_name, cases in self._suites.items(): + + # Test suites + for suite_name, cases in self._test_suites.items(): total = len(cases) passed = sum(1 for c in cases.values() if c["passed"]) suites_data[suite_name] = { - "kind": self._suite_kinds.get(suite_name, "test"), + "kind": str(self._suite_kinds.get(suite_name, "test")), "total_cases": total, "passed": passed, "failed": total - passed, @@ -74,27 +132,98 @@ def on_session_end(self, _result: SessionResult) -> None: "cases": cases, } + # Eval suites + all_score_stats: 
list[Any] = [] + for suite_name, report in self._eval_reports.items(): + sm = self._eval_suite_metadata.get(suite_name, {}) + suites_data[suite_name] = { + "kind": "eval", + "model": sm.get("model"), + "provider": sm.get("provider"), + "total_cases": report.total_count, + "passed": report.passed_count, + "failed": report.failed_count, + "errored": report.errored_count, + "pass_rate": round(report.pass_rate, 4), + "duration": round(report.duration, 2), + "cases": {c.case_name: _serialize_eval_case(c) for c in report.cases}, + } + all_score_stats.extend(report.all_score_stats()) + + # Build evals summary (non-null only if we have eval data) + evals_summary = None + if self._eval_reports: + # Derive top-level model from first eval suite (or None if mixed) + models = { + sm.get("model") + for sm in self._eval_suite_metadata.values() + if sm.get("model") + } + top_model = models.pop() if len(models) == 1 else None + providers = { + sm.get("provider") + for sm in self._eval_suite_metadata.values() + if sm.get("provider") + } + top_provider = providers.pop() if len(providers) == 1 else None + + # Aggregate judge info (first one found, or None) + judge_dict = None + if self._eval_judge_info: + first_judge = next(iter(self._eval_judge_info.values())) + judge_dict = first_judge + + scores_summary = { + s.name: { + "mean": round(s.mean, 4), + "median": round(s.median, 4), + "p5": round(s.p5, 4), + "p95": round(s.p95, 4), + "min": round(s.min, 4), + "max": round(s.max, 4), + "count": s.count, + } + for s in all_score_stats + } + + evals_summary = { + "model": top_model, + "provider": top_provider, + "judge": judge_dict, + "scores_summary": scores_summary, + } + entry: dict[str, Any] = { "run_id": str(uuid.uuid4()), "timestamp": datetime.now(tz=timezone.utc).isoformat(), "git": collect_git_info(), "environment": collect_env_info(), "metadata": self._metadata, - "evals": None, + "evals": evals_summary, "suites": suites_data, } append_entry(self._history_file, entry) - def 
_record(self, result: TestResult, *, passed: bool) -> None: - suite_name = self._get_suite_name(result) - if suite_name not in self._suites: - self._suites[suite_name] = {} - self._suites[suite_name][result.name] = { - "passed": passed, - "duration": round(result.duration, 3), - } - def _get_suite_name(self, result: TestResult) -> str: - if result.suite_path: - return result.suite_path.root_name - return self._default_suite_name +def _serialize_eval_case(case: EvalCaseResult) -> dict[str, Any]: + """Serialize an eval case result for JSONL storage.""" + entry: dict[str, Any] = { + "passed": case.passed, + "is_error": case.is_error, + "duration": round(case.duration, 3), + "scores": {s.name: s.value for s in case.scores if s.is_metric}, + "case_hash": case.case_hash, + "eval_hash": case.eval_hash, + } + labels = {s.name: s.value for s in case.scores if isinstance(s.value, str)} + if labels: + entry["labels"] = labels + assertions = {s.name: s.value for s in case.scores if isinstance(s.value, bool)} + if assertions: + entry["assertions"] = assertions + return entry + + +def load_previous_eval_run(history_dir: Any = None) -> dict[str, Any] | None: + """Load the most recent eval run from history.""" + return load_previous_run(history_dir=history_dir, evals_only=True) diff --git a/tests/evals/test_e2e.py b/tests/evals/test_e2e.py index fc35686..f1fc5d1 100644 --- a/tests/evals/test_e2e.py +++ b/tests/evals/test_e2e.py @@ -1,7 +1,7 @@ """End-to-end tests for ProTest evals integration. These tests define the PUBLIC API contract. 
They test what the user sees: -- Session setup (EvalSession, EvalSuite + @suite.eval with ForEach/From) +- Session setup (ProTestSession, EvalSuite + @suite.eval with ForEach/From) - CLI behavior (protest run vs protest eval) - Output format (scores table, trends, failure messages) - History (JSONL format, stats, significance, clean-dirty) @@ -46,7 +46,6 @@ ) from protest.evals.hashing import compute_case_hash, compute_eval_hash from protest.evals.results_writer import EvalResultsWriter -from protest.evals.session import EvalSession from protest.evals.suite import EvalSuite from protest.evals.types import EvalSuiteReport # noqa: TC001 — used at runtime from protest.filters.kind import KindFilterPlugin @@ -104,11 +103,11 @@ async def async_echo_task(text: str) -> str: # --------------------------------------------------------------------------- -class TestEvalSession: - """EvalSession setup: constructor with model=, EvalSuite + @suite.eval.""" +class TestEvalSetup: + """Eval setup: ProTestSession + EvalSuite with model=, @suite.eval.""" def test_add_eval_creates_eval_kind(self) -> None: - session = EvalSession() + session = ProTestSession() eval_echo_suite = EvalSuite("eval_echo") session.add_suite(eval_echo_suite) @@ -121,18 +120,18 @@ def eval_echo(case: Annotated[EvalCase, From(basic_cases)]) -> str: assert len(session._suites) > 0 assert any(s.kind == "eval" for s in session._suites) - def test_model_set_via_constructor(self) -> None: - session = EvalSession(model=ModelInfo(name="test-model")) - assert session._eval_model is not None - assert session._eval_model.name == "test-model" + def test_model_set_via_suite(self) -> None: + suite = EvalSuite("eval_echo", model=ModelInfo(name="test-model")) + assert suite._model is not None + assert suite._model.name == "test-model" def test_metadata_on_constructor(self) -> None: - session = EvalSession(metadata={"env": "test"}) + session = ProTestSession(metadata={"env": "test"}) assert session.metadata["env"] == "test" 
def test_eval_with_bool_verdict(self) -> None: """Evaluator with bool field: case_fail has matches_expected=False -> fail.""" - session = EvalSession() + session = ProTestSession() eval_echo_suite = EvalSuite("eval_echo") session.add_suite(eval_echo_suite) @@ -148,7 +147,7 @@ def eval_echo(case: Annotated[EvalCase, From(basic_cases)]) -> str: assert result.success is False def test_async_task_works(self) -> None: - session = EvalSession() + session = ProTestSession() eval_echo_suite = EvalSuite("eval_echo") session.add_suite(eval_echo_suite) @@ -169,7 +168,7 @@ def test_async_evaluator_does_not_crash(self) -> None: ids=lambda c: c.name, ) - session = EvalSession() + session = ProTestSession() eval_echo_suite = EvalSuite("eval_echo") session.add_suite(eval_echo_suite) @@ -196,7 +195,7 @@ def test_test_suite_has_kind_test(self) -> None: assert suite.kind == "test" def test_eval_suite_has_kind_eval(self) -> None: - session = EvalSession() + session = ProTestSession() eval_echo_suite = EvalSuite("eval_echo") session.add_suite(eval_echo_suite) @@ -306,7 +305,7 @@ class ReportCapture(PluginBase): def on_eval_suite_end(self, report: Any) -> None: reports.append(report) - session = EvalSession() + session = ProTestSession() session.register_plugin(ReportCapture()) eval_echo_suite = EvalSuite("eval_echo") @@ -334,7 +333,7 @@ class ReportCapture(PluginBase): def on_eval_suite_end(self, report: Any) -> None: reports.append(report) - session = EvalSession() + session = ProTestSession() session.register_plugin(ReportCapture()) eval_echo_suite = EvalSuite("eval_echo") @@ -361,7 +360,7 @@ def on_test_fail(self, result: Any) -> None: if result.error: errors.append(str(result.error)) - session = EvalSession() + session = ProTestSession() session.register_plugin(ErrorCollector()) eval_echo_suite = EvalSuite("eval_echo") @@ -397,7 +396,7 @@ def on_test_pass(self, result: Any) -> None: def on_test_fail(self, result: Any) -> None: collected.append(result) - session = EvalSession() + 
session = ProTestSession() session.register_plugin(Collector()) eval_echo_suite = EvalSuite("eval_echo") @@ -432,7 +431,7 @@ def on_test_setup_done(self, info: Any) -> None: def on_test_teardown_start(self, info: Any) -> None: teardown_ids.append(info.node_id) - session = EvalSession() + session = ProTestSession() session.register_plugin(LifecycleCollector()) eval_echo_suite = EvalSuite("eval_echo") @@ -472,7 +471,7 @@ def crashing_evaluator(ctx: EvalContext) -> bool: ids=lambda c: c.name, ) - session = EvalSession() + session = ProTestSession() session.register_plugin(Collector()) eval_echo_suite = EvalSuite("eval_echo") @@ -522,9 +521,9 @@ class TestHistory: """JSONL history format and querying.""" def _run_eval(self, tmp_path: Path) -> None: - session = EvalSession(model=ModelInfo(name="test-model"), history_dir=tmp_path) + session = ProTestSession(history_dir=tmp_path) - eval_echo_suite = EvalSuite("eval_echo") + eval_echo_suite = EvalSuite("eval_echo", model=ModelInfo(name="test-model")) session.add_suite(eval_echo_suite) @eval_echo_suite.eval(evaluators=[fake_accuracy]) @@ -585,7 +584,7 @@ def test_history_multiple_runs_append(self, tmp_path: Path) -> None: assert len(lines) == 2 def test_history_metadata_included(self, tmp_path: Path) -> None: - session = EvalSession( + session = ProTestSession( history_dir=tmp_path, metadata={"env": "test", "version": "1.0"}, ) @@ -655,7 +654,7 @@ class TestCaseHashing: def test_case_hash_stored_in_history(self, tmp_path: Path) -> None: """History entries include case_hash and eval_hash per case.""" - session = EvalSession(history_dir=tmp_path) + session = ProTestSession(history_dir=tmp_path) eval_echo_suite = EvalSuite("eval_echo") session.add_suite(eval_echo_suite) @@ -796,7 +795,7 @@ def on_test_fail(self, result: Any) -> None: ids=lambda c: c.name, ) - session = EvalSession() + session = ProTestSession() session.register_plugin(Collector()) eval_echo_suite = EvalSuite("eval_echo") @@ -833,7 +832,7 @@ def 
on_test_fail(self, result: Any) -> None: ids=lambda c: c.name, ) - session = EvalSession() + session = ProTestSession() session.register_plugin(Collector()) eval_echo_suite = EvalSuite("eval_echo") @@ -868,7 +867,7 @@ def bad_evaluator(ctx: EvalContext) -> float: ids=lambda c: c.name, ) - session = EvalSession() + session = ProTestSession() session.register_plugin(Collector()) eval_echo_suite = EvalSuite("eval_echo") @@ -901,7 +900,7 @@ def expensive(ctx: EvalContext) -> bool: call_log.append("expensive") return True - session = EvalSession() + session = ProTestSession() eval_echo_suite = EvalSuite("eval_echo") session.add_suite(eval_echo_suite) @@ -934,7 +933,7 @@ def check_b(ctx: EvalContext) -> bool: single = ForEach( [EvalCase(inputs="x", expected="x", name="c1")], ids=lambda c: c.name ) - session = EvalSession() + session = ProTestSession() eval_echo_suite = EvalSuite("eval_echo") session.add_suite(eval_echo_suite) @@ -960,7 +959,7 @@ class TestResultsFiles: def _run_eval(self, tmp_path: Path) -> Path: results_dir = tmp_path / "results" - session = EvalSession() + session = ProTestSession() writer = EvalResultsWriter(history_dir=tmp_path) session.register_plugin(writer) @@ -1032,7 +1031,7 @@ def _run_multi(self, tmp_path: Path) -> dict[str, Any]: ids=lambda c: c.name, ) - session = EvalSession(history_dir=tmp_path) + session = ProTestSession(history_dir=tmp_path) pipeline_suite = EvalSuite("pipeline") session.add_suite(pipeline_suite) @@ -1075,7 +1074,7 @@ class TestEvalTaskFixtures: def test_task_without_fixtures_still_works(self) -> None: # basic_cases has one match (case_pass) and one mismatch (case_fail) # fake_accuracy returns matches_expected=False for case_fail -> fail - session = EvalSession() + session = ProTestSession() eval_echo_suite = EvalSuite("eval_echo") session.add_suite(eval_echo_suite) @@ -1102,7 +1101,7 @@ def prefix_service() -> str: ids=lambda c: c.name, ) - session = EvalSession() + session = ProTestSession() session.bind(prefix_service) 
eval_prefixed_suite = EvalSuite("eval_prefixed") @@ -1140,7 +1139,7 @@ def expensive_resource() -> str: ids=lambda c: c.name, ) - session = EvalSession() + session = ProTestSession() session.bind(expensive_resource) eval_resource_suite = EvalSuite("eval_resource") diff --git a/tests/evals/test_judge.py b/tests/evals/test_judge.py index 7eece34..e711bdb 100644 --- a/tests/evals/test_judge.py +++ b/tests/evals/test_judge.py @@ -7,7 +7,7 @@ import pytest -from protest import ForEach, From +from protest import ForEach, From, ProTestSession from protest.core.runner import TestRunner from protest.evals import ( EvalContext, @@ -17,7 +17,6 @@ Verdict, evaluator, ) -from protest.evals.session import EvalSession from protest.evals.suite import EvalSuite from protest.plugin import PluginBase @@ -210,7 +209,7 @@ async def test_judge_none_tokens_not_accumulated(self) -> None: # --------------------------------------------------------------------------- -# E2E: EvalSession with judge +# E2E: ProTestSession with judge on EvalSuite # --------------------------------------------------------------------------- single_case = ForEach( @@ -227,8 +226,8 @@ def test_judge_available_in_evaluator(self) -> None: async def judge_evaluator(ctx: EvalContext) -> bool: return await ctx.judge("pass this", bool) - session = EvalSession(judge=FakeJudge()) - eval_echo_suite = EvalSuite("eval_echo") + session = ProTestSession() + eval_echo_suite = EvalSuite("eval_echo", judge=FakeJudge()) session.add_suite(eval_echo_suite) @eval_echo_suite.eval(evaluators=[judge_evaluator]) @@ -246,8 +245,8 @@ def test_no_judge_is_fixture_error(self) -> None: async def needs_judge(ctx: EvalContext) -> bool: return await ctx.judge("test", bool) - session = EvalSession() # no judge - eval_echo_suite = EvalSuite("eval_echo") + session = ProTestSession() + eval_echo_suite = EvalSuite("eval_echo") # no judge session.add_suite(eval_echo_suite) @eval_echo_suite.eval(evaluators=[needs_judge]) @@ -278,8 +277,8 @@ async def 
double_judge(ctx: EvalContext) -> bool: r2 = await ctx.judge("pass second", bool) return r1 and r2 - session = EvalSession(judge=FakeJudge()) - eval_echo_suite = EvalSuite("eval_echo") + session = ProTestSession() + eval_echo_suite = EvalSuite("eval_echo", judge=FakeJudge()) session.add_suite(eval_echo_suite) @eval_echo_suite.eval(evaluators=[double_judge]) @@ -305,17 +304,17 @@ def on_test_pass(self, result: Any) -> None: assert payload.judge_output_tokens == 10 # 5 per call x 2 assert payload.judge_cost == pytest.approx(0.002) # 0.001 per call x 2 - def test_judge_info_derived_from_instance(self) -> None: - """EvalSession derives JudgeInfo from Judge instance.""" - session = EvalSession(judge=FakeJudge()) - assert session._eval_judge is not None - assert session._eval_judge.name == "fake-judge" - assert session._eval_judge.provider == "test" + def test_judge_info_derived_from_suite(self) -> None: + """EvalSuite derives JudgeInfo from Judge instance.""" + suite = EvalSuite("eval_echo", judge=FakeJudge()) + assert suite._judge is not None + assert suite._judge.name == "fake-judge" + assert suite._judge.provider == "test" def test_no_judge_no_judge_info(self) -> None: - """EvalSession without judge has no JudgeInfo.""" - session = EvalSession() - assert session._eval_judge is None + """EvalSuite without judge has no JudgeInfo.""" + suite = EvalSuite("eval_echo") + assert suite._judge is None def test_judge_with_structured_output(self) -> None: """Judge returns structured dataclass via output_type.""" @@ -335,8 +334,8 @@ async def judge(self, prompt: str, output_type: type) -> JudgeResponse: async def struct_evaluator(ctx: EvalContext) -> JudgeVerdict: return await ctx.judge("evaluate this", JudgeVerdict) - session = EvalSession(judge=StructuredJudge()) - eval_echo_suite = EvalSuite("eval_echo") + session = ProTestSession() + eval_echo_suite = EvalSuite("eval_echo", judge=StructuredJudge()) session.add_suite(eval_echo_suite) 
@eval_echo_suite.eval(evaluators=[struct_evaluator]) @@ -361,7 +360,7 @@ def test_task_result_unwrapped_for_evaluators(self) -> None: def check_output(ctx: EvalContext) -> bool: return ctx.output == "hello" # sees str, not TaskResult - session = EvalSession() + session = ProTestSession() eval_echo_suite = EvalSuite("eval_echo") session.add_suite(eval_echo_suite) @@ -385,7 +384,7 @@ def test_task_usage_in_payload(self) -> None: def always_pass(ctx: EvalContext) -> bool: return True - session = EvalSession() + session = ProTestSession() eval_echo_suite = EvalSuite("eval_echo") session.add_suite(eval_echo_suite) @@ -423,7 +422,7 @@ def test_plain_return_has_zero_task_usage(self) -> None: def always_pass(ctx: EvalContext) -> bool: return True - session = EvalSession() + session = ProTestSession() eval_echo_suite = EvalSuite("eval_echo") session.add_suite(eval_echo_suite) From 9c5830230ecd1bcdaad4ec0c116fff7457305320 Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Tue, 14 Apr 2026 20:33:31 +0200 Subject: [PATCH 25/60] refactor(evals): replace evaluator function wrapper with `Evaluator` class - Introduced `Evaluator --- protest/evals/__init__.py | 2 + protest/evals/evaluator.py | 75 ++++++++++++++++++++++---------------- protest/evals/hashing.py | 16 ++++++-- protest/evals/wrapper.py | 11 ++++-- protest/history/plugin.py | 2 +- 5 files changed, 67 insertions(+), 39 deletions(-) diff --git a/protest/evals/__init__.py b/protest/evals/__init__.py index 8584eff..c985114 100644 --- a/protest/evals/__init__.py +++ b/protest/evals/__init__.py @@ -3,6 +3,7 @@ from protest.evals.evaluator import ( EvalCase, EvalContext, + Evaluator, Metric, Reason, ShortCircuit, @@ -27,6 +28,7 @@ "EvalContext", "EvalScore", "EvalSuiteReport", + "Evaluator", "Judge", "JudgeInfo", "JudgeResponse", diff --git a/protest/evals/evaluator.py b/protest/evals/evaluator.py index 6baab51..569ce83 100644 --- a/protest/evals/evaluator.py +++ 
b/protest/evals/evaluator.py @@ -32,8 +32,6 @@ async def llm_judge(ctx: EvalContext, model: str = "haiku") -> bool: from __future__ import annotations import dataclasses -import functools -import inspect from dataclasses import dataclass, field from typing import ( TYPE_CHECKING, @@ -50,6 +48,8 @@ async def llm_judge(ctx: EvalContext, model: str = "haiku") -> bool: from protest.evals.types import EvalScore if TYPE_CHECKING: + from collections.abc import Callable + from protest.evals.types import Judge InputT = TypeVar("InputT") @@ -217,37 +217,50 @@ def extract_scores_from_result(result: Any, evaluator_name: str) -> list[Any]: raise TypeError(f"Evaluator must return bool or dataclass, got {type_name}") -def evaluator(fn: Any) -> Any: - """Decorator that turns a function into a protest evaluator. - - The decorated function can be called two ways: +class Evaluator: + """A configured evaluator — callable with identity for hashing. - 1. ``evaluator_fn(ctx)`` — evaluate directly - 2. ``evaluator_fn(keyword=value, ...)`` — returns a bound evaluator (partial) + Created by the ``@evaluator`` decorator. Supports two calling modes: - This is just ``functools.partial`` with nicer ergonomics: when the first - positional argument is an ``EvalContext``, the function evaluates. Otherwise, - all arguments are bound and the result is a new callable expecting only ``ctx``. + 1. ``ev(ctx)`` — evaluate directly (first arg is EvalContext) + 2. 
``ev(keyword=value, ...)`` — bind params, return a new Evaluator """ - sig = inspect.signature(fn) - params = list(sig.parameters.values()) - has_extra_params = len(params) > 1 - @functools.wraps(fn) - def wrapper(*args: Any, **kwargs: Any) -> Any: - # Direct call: first positional arg is an EvalContext + __slots__ = ("_fn", "_kwargs", "_name", "_qualname") + + def __init__( + self, fn: Callable[..., Any], kwargs: dict[str, Any] | None = None + ) -> None: + self._fn = fn + self._kwargs = kwargs or {} + self._name = fn.__name__ + self._qualname = fn.__qualname__ + + @property + def name(self) -> str: + return self._name + + def __call__(self, *args: Any, **kwargs: Any) -> Any: if args and isinstance(args[0], EvalContext): - return fn(*args, **kwargs) - # Bind params → return partial - if has_extra_params and kwargs: - bound = functools.partial(fn, **kwargs) - # Preserve async detection on the partial - bound.__name__ = fn.__name__ # type: ignore[attr-defined] - bound.__qualname__ = fn.__qualname__ # type: ignore[attr-defined] - return bound - # No args at all — if no extra params, this IS the evaluator - if not has_extra_params and not args and not kwargs: - return fn - return fn(*args, **kwargs) - - return wrapper + merged = {**self._kwargs, **kwargs} + return self._fn(*args, **merged) + if kwargs: + return Evaluator(self._fn, {**self._kwargs, **kwargs}) + return self + + def evaluator_identity(self) -> dict[str, Any]: + identity: dict[str, Any] = {"fn": self._qualname} + if self._kwargs: + identity["kwargs"] = self._kwargs + return identity + + def __repr__(self) -> str: + if self._kwargs: + kw = ", ".join(f"{k}={v!r}" for k, v in self._kwargs.items()) + return f"Evaluator({self._name}({kw}))" + return f"Evaluator({self._name})" + + +def evaluator(fn: Callable[..., Any]) -> Evaluator: + """Turn a function into a ProTest evaluator.""" + return Evaluator(fn) diff --git a/protest/evals/hashing.py b/protest/evals/hashing.py index 5ebe725..5b3114a 100644 --- 
a/protest/evals/hashing.py +++ b/protest/evals/hashing.py @@ -15,7 +15,7 @@ import functools import hashlib import json -from typing import Any +from typing import Any, Protocol, runtime_checkable HASH_LENGTH = 12 @@ -24,6 +24,13 @@ class CanonicalError(TypeError): """Raised when an object cannot be converted to a canonical form.""" +@runtime_checkable +class HasEvaluatorIdentity(Protocol): + """Protocol for objects that provide explicit hashing identity.""" + + def evaluator_identity(self) -> dict[str, Any]: ... + + def compute_case_hash(inputs: Any, expected_output: Any) -> str: """Hash the case content (inputs + expected_output).""" data = {"inputs": _canonical(inputs), "expected": _canonical(expected_output)} @@ -56,10 +63,13 @@ def _canonical(obj: Any) -> Any: # noqa: PLR0911 if isinstance(obj, (list, tuple)): return [_canonical(item) for item in obj] if isinstance(obj, dict): - return {str(k): _canonical(v) for k, v in sorted(obj.items())} + return { + str(k): _canonical(v) + for k, v in sorted(obj.items(), key=lambda item: str(item[0])) + } # --- explicit identity (user-controlled) --- - if hasattr(obj, "evaluator_identity"): + if isinstance(obj, HasEvaluatorIdentity): return _canonical(obj.evaluator_identity()) # --- introspection fallback --- diff --git a/protest/evals/wrapper.py b/protest/evals/wrapper.py index e9161f1..d4278c0 100644 --- a/protest/evals/wrapper.py +++ b/protest/evals/wrapper.py @@ -16,6 +16,7 @@ from protest.evals.evaluator import ( EvalCase, EvalContext, + Evaluator, ShortCircuit, extract_scores_from_result, ) @@ -181,7 +182,7 @@ async def run_evaluators( scores.extend(await _run_short_circuit(ev.evaluators, ctx)) continue - evaluator_name = getattr(ev, "__name__", type(ev).__name__) + evaluator_name = ev.name if isinstance(ev, Evaluator) else type(ev).__name__ try: raw = ev(ctx) result = await raw if asyncio.iscoroutine(raw) else raw @@ -199,7 +200,7 @@ async def _run_short_circuit( """Run evaluators in order, stop at first 
Verdict=False.""" scores: list[EvalScore] = [] for i, ev in enumerate(evaluators): - evaluator_name = getattr(ev, "__name__", type(ev).__name__) + evaluator_name = ev.name if isinstance(ev, Evaluator) else type(ev).__name__ try: raw = ev(ctx) result = await raw if asyncio.iscoroutine(raw) else raw @@ -210,8 +211,10 @@ async def _run_short_circuit( if any(s.is_verdict and not s.passed for s in extracted): # Mark remaining evaluators as skipped for skipped_ev in evaluators[i + 1 :]: - skipped_name = getattr( - skipped_ev, "__name__", type(skipped_ev).__name__ + skipped_name = ( + skipped_ev.name + if isinstance(skipped_ev, Evaluator) + else type(skipped_ev).__name__ ) scores.append(EvalScore(name=skipped_name, value=False, skipped=True)) break diff --git a/protest/history/plugin.py b/protest/history/plugin.py index 930ca61..eac653c 100644 --- a/protest/history/plugin.py +++ b/protest/history/plugin.py @@ -71,7 +71,7 @@ def setup(self, session: ProTestSession) -> None: if isinstance(suite, EvalSuite) and suite.judge is not None: self._eval_judge_info[suite.name] = { "name": suite.judge.name, - "provider": getattr(suite.judge, "provider", None), + "provider": suite.judge.provider, } elif not self._default_suite_name or self._default_suite_name == "tests": self._default_suite_name = suite.name From 6f6d16a79077ad56fb41056a7c983040cd197c2e Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Tue, 14 Apr 2026 21:05:51 +0200 Subject: [PATCH 26/60] refactor(examples): replace dict-based eval cases with `EvalCase` objects in Yorkshire example dataset --- examples/yorkshire/evals/dataset.py | 169 ++++++++++++++-------------- 1 file changed, 85 insertions(+), 84 deletions(-) diff --git a/examples/yorkshire/evals/dataset.py b/examples/yorkshire/evals/dataset.py index 7153ab6..e7ad926 100644 --- a/examples/yorkshire/evals/dataset.py +++ b/examples/yorkshire/evals/dataset.py @@ -3,6 +3,7 @@ from __future__ import annotations from 
protest import ForEach +from protest.evals import EvalCase from protest.evals.evaluators import ( contains_keywords, does_not_contain, @@ -13,109 +14,109 @@ yorkshire_cases = ForEach( [ # --- Factual recall --- - { - "name": "weight_question", - "inputs": "How much does a Yorkshire Terrier weigh?", - "expected": "2-3 kg", - "metadata": {"tags": ["factual", "size"]}, - "evaluators": [ + EvalCase( + name="weight_question", + inputs="How much does a Yorkshire Terrier weigh?", + expected="2-3 kg", + metadata={"tags": ["factual", "size"]}, + evaluators=[ contains_keywords(keywords=["2-3 kg", "teacup", "mini", "standard"]) ], - }, - { - "name": "grooming_basics", - "inputs": "How often should I brush my Yorkie?", - "expected": "daily brushing for long coats", - "metadata": {"tags": ["factual", "grooming"]}, - "evaluators": [contains_keywords(keywords=["daily", "brushing", "long"])], - }, - { - "name": "diet_advice", - "inputs": "What should I feed my Yorkshire Terrier?", - "expected": "small breed formula, 2-3 meals", - "metadata": {"tags": ["factual", "diet"]}, - "evaluators": [ + ), + EvalCase( + name="grooming_basics", + inputs="How often should I brush my Yorkie?", + expected="daily brushing for long coats", + metadata={"tags": ["factual", "grooming"]}, + evaluators=[contains_keywords(keywords=["daily", "brushing", "long"])], + ), + EvalCase( + name="diet_advice", + inputs="What should I feed my Yorkshire Terrier?", + expected="small breed formula, 2-3 meals", + metadata={"tags": ["factual", "diet"]}, + evaluators=[ contains_keywords(keywords=["small breed", "meals", "avoid"]) ], - }, - { - "name": "exercise_needs", - "inputs": "How much exercise does a Yorkie need?", - "expected": "30 minutes daily", - "metadata": {"tags": ["factual", "exercise"]}, - "evaluators": [contains_keywords(keywords=["30 minutes", "walk"])], - }, + ), + EvalCase( + name="exercise_needs", + inputs="How much exercise does a Yorkie need?", + expected="30 minutes daily", + metadata={"tags": 
["factual", "exercise"]}, + evaluators=[contains_keywords(keywords=["30 minutes", "walk"])], + ), # --- Temperament --- - { - "name": "personality", - "inputs": "What is the temperament of a Yorkshire Terrier?", - "expected": "bold, confident, affectionate", - "metadata": {"tags": ["factual", "temperament"]}, - "evaluators": [ + EvalCase( + name="personality", + inputs="What is the temperament of a Yorkshire Terrier?", + expected="bold, confident, affectionate", + metadata={"tags": ["factual", "temperament"]}, + evaluators=[ contains_keywords(keywords=["bold", "confident", "affectionate"]) ], - }, + ), # --- Age-specific --- - { - "name": "puppy_care", - "inputs": "How do I care for a Yorkshire puppy?", - "expected": "extra care, socialization", - "metadata": {"tags": ["factual", "puppies"]}, - "evaluators": [contains_keywords(keywords=["12 months", "socialization"])], - }, - { - "name": "senior_care", - "inputs": "My Yorkie is getting old, what should I change?", - "expected": "adjust exercise, more vet visits", - "metadata": {"tags": ["factual", "seniors"]}, - "evaluators": [contains_keywords(keywords=["senior", "exercise", "vet"])], - }, + EvalCase( + name="puppy_care", + inputs="How do I care for a Yorkshire puppy?", + expected="extra care, socialization", + metadata={"tags": ["factual", "puppies"]}, + evaluators=[contains_keywords(keywords=["12 months", "socialization"])], + ), + EvalCase( + name="senior_care", + inputs="My Yorkie is getting old, what should I change?", + expected="adjust exercise, more vet visits", + metadata={"tags": ["factual", "seniors"]}, + evaluators=[contains_keywords(keywords=["senior", "exercise", "vet"])], + ), # --- Hallucination checks --- - { - "name": "no_cat_advice", - "inputs": "Tell me about Yorkshire Terrier health", - "expected": "dental problems, patellar luxation", - "metadata": {"tags": ["safety"]}, - "evaluators": [ + EvalCase( + name="no_cat_advice", + inputs="Tell me about Yorkshire Terrier health", + expected="dental 
problems, patellar luxation", + metadata={"tags": ["safety"]}, + evaluators=[ does_not_contain(forbidden=["cat", "feline", "persian"]), contains_keywords(keywords=["dental", "health"]), ], - }, - { - "name": "no_made_up_breeds", - "inputs": "What jobs can a Yorkie do?", - "expected": "therapy dogs, companions", - "metadata": {"tags": ["safety"]}, - "evaluators": [ + ), + EvalCase( + name="no_made_up_breeds", + inputs="What jobs can a Yorkie do?", + expected="therapy dogs, companions", + metadata={"tags": ["safety"]}, + evaluators=[ does_not_contain(forbidden=["labrador", "golden retriever", "poodle"]), contains_keywords(keywords=["therapy", "companion"]), ], - }, + ), # --- Edge cases --- - { - "name": "unknown_topic", - "inputs": "What is the GDP of France?", - "expected": "I'm not sure", - "metadata": {"tags": ["edge_case"]}, - "evaluators": [contains_keywords(keywords=["not sure", "specialize"])], - }, - { - "name": "empty_question", - "inputs": "", - "expected": "I'm not sure", - "metadata": {"tags": ["edge_case"]}, - "evaluators": [contains_keywords(keywords=["not sure"])], - }, + EvalCase( + name="unknown_topic", + inputs="What is the GDP of France?", + expected="I'm not sure", + metadata={"tags": ["edge_case"]}, + evaluators=[contains_keywords(keywords=["not sure", "specialize"])], + ), + EvalCase( + name="empty_question", + inputs="", + expected="I'm not sure", + metadata={"tags": ["edge_case"]}, + evaluators=[contains_keywords(keywords=["not sure"])], + ), # --- Known weak spot (chatbot doesn't know about training treats) --- - { - "name": "training_treats", - "inputs": "What treats are best for training a Yorkie?", - "expected": "small soft treats, positive reinforcement", - "metadata": {"tags": ["factual", "training"]}, - "evaluators": [ + EvalCase( + name="training_treats", + inputs="What treats are best for training a Yorkie?", + expected="small soft treats, positive reinforcement", + metadata={"tags": ["factual", "training"]}, + evaluators=[ 
contains_keywords(keywords=["treats", "small", "soft", "reward"]) ], - }, + ), ] ) From 1d42252ffa3d3e20a2c59f72168b18223d763baa Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Tue, 14 Apr 2026 21:53:44 +0200 Subject: [PATCH 27/60] docs(evals): clarify that `EvalCase` must replace plain dicts --- docs/evals.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/evals.md b/docs/evals.md index 387267f..96e51d7 100644 --- a/docs/evals.md +++ b/docs/evals.md @@ -68,7 +68,7 @@ async def chatbot(case: Annotated[EvalCase, From(cases)]) -> str: ## EvalCase -Typed dataclass for eval case data. Provides IDE autocompletion instead of untyped dicts. +Typed dataclass for eval case data. All eval cases **must** use `EvalCase` — plain dicts are not supported. ```python from protest.evals import EvalCase From 9a4ce43eb4f9fb672b5138f854474b5b269c6d14 Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Fri, 24 Apr 2026 07:37:24 +0200 Subject: [PATCH 28/60] refactor(reporting): centralize shared formatting logic and add CLI options for output/log visibility - Moved duplicate formatting helpers (`format_duration`, `format_usage`) to a new `format` module for reuse. - Added `--show-output` and `--show-logs` CLI options to enhance reporting flexibility. - Updated tests to verify symmetry between Rich and ASCII reporters for structural, CLI, and behavioral consistency. 
--- examples/yorkshire/evals/dataset.py | 4 +- protest/cli/main.py | 15 + protest/reporting/ascii.py | 115 ++++--- protest/reporting/format.py | 39 +++ protest/reporting/rich_reporter.py | 56 +--- tests/reporting/test_reporter_symmetry.py | 369 ++++++++++++++++++++++ tests/reporting/test_rich_reporter.py | 8 +- 7 files changed, 511 insertions(+), 95 deletions(-) create mode 100644 protest/reporting/format.py create mode 100644 tests/reporting/test_reporter_symmetry.py diff --git a/examples/yorkshire/evals/dataset.py b/examples/yorkshire/evals/dataset.py index e7ad926..89b3362 100644 --- a/examples/yorkshire/evals/dataset.py +++ b/examples/yorkshire/evals/dataset.py @@ -35,9 +35,7 @@ inputs="What should I feed my Yorkshire Terrier?", expected="small breed formula, 2-3 meals", metadata={"tags": ["factual", "diet"]}, - evaluators=[ - contains_keywords(keywords=["small breed", "meals", "avoid"]) - ], + evaluators=[contains_keywords(keywords=["small breed", "meals", "avoid"])], ), EvalCase( name="exercise_needs", diff --git a/protest/cli/main.py b/protest/cli/main.py index 648fd26..9c0b324 100644 --- a/protest/cli/main.py +++ b/protest/cli/main.py @@ -230,6 +230,21 @@ def _create_run_parser() -> argparse.ArgumentParser: default=0, help="Increase verbosity (-v for lifecycle, -vv for fixtures)", ) + parser.add_argument( + "--show-output", + dest="show_output", + action="store_true", + help="Show eval inputs/output/expected per case", + ) + parser.add_argument( + "--show-logs", + dest="show_logs", + nargs="?", + const="INFO", + default=None, + metavar="LEVEL", + help="Show captured log records (default: INFO+)", + ) return parser diff --git a/protest/reporting/ascii.py b/protest/reporting/ascii.py index 9296ae6..64470b8 100644 --- a/protest/reporting/ascii.py +++ b/protest/reporting/ascii.py @@ -1,3 +1,4 @@ +import logging import sys import traceback from pathlib import Path @@ -23,6 +24,12 @@ ) from protest.evals.types import EvalSuiteReport from protest.plugin import 
PluginBase, PluginContext +from protest.reporting.format import ( + format_duration as _format_duration, +) +from protest.reporting.format import ( + format_usage as _format_usage, +) from protest.reporting.verbosity import Verbosity _MIN_NODE_ID_PARTS = 2 @@ -51,38 +58,23 @@ def _format_test_name(result: TestResult, include_suite: bool = False) -> str: return name -MIN_DURATION_THRESHOLD = 0.001 - - -def _format_duration(seconds: float) -> str: - """Format duration: ms for fast, s for slow.""" - if seconds < MIN_DURATION_THRESHOLD: - return "<1ms" - if seconds < 1: - return f"{seconds * 1000:.0f}ms" - return f"{seconds:.2f}s" - - -_TOKEN_K_THRESHOLD = 1000 - - -def _format_tokens(tokens: int) -> str: - return ( - f"{tokens / _TOKEN_K_THRESHOLD:.1f}k" - if tokens >= _TOKEN_K_THRESHOLD - else str(tokens) - ) - - -def _format_usage(input_tokens: int, output_tokens: int, cost: float) -> str: +def _format_eval_scores_inline(result: TestResult) -> str: + """Format eval scores for inline display — ASCII version (no glyphs).""" + if not result.eval_payload: + return "" parts: list[str] = [] - if input_tokens > 0 or output_tokens > 0: - parts.append( - f"{_format_tokens(input_tokens)} in / {_format_tokens(output_tokens)} out" - ) - if cost > 0: - parts.append(f"${cost:.4f}") - return ", ".join(parts) + for name, entry in result.eval_payload.scores.items(): + if entry.skipped: + parts.append(f"{name}=skip") + continue + val = entry.value + if isinstance(val, bool): + parts.append(f"{name}={'pass' if val else 'fail'}") + elif isinstance(val, float): + parts.append(f"{name}={val:.2f}") + else: + parts.append(f"{name}={val}") + return f" {' '.join(parts)}" if parts else "" class AsciiReporter(PluginBase): @@ -91,8 +83,15 @@ class AsciiReporter(PluginBase): name = "ascii-reporter" description = "Plain ASCII reporter" - def __init__(self, verbosity: int = 0) -> None: + def __init__( + self, + verbosity: int = 0, + show_logs: str | None = None, + show_output: bool = False, + ) -> 
None: self._verbosity = verbosity + self._show_logs = show_logs + self._show_output = show_output self._is_parallel = False self._failed_results: list[TestResult] = [] self._error_results: list[TestResult] = [] @@ -100,9 +99,36 @@ def __init__(self, verbosity: int = 0) -> None: @classmethod def activate(cls, ctx: PluginContext) -> Self | None: if ctx.get("no_color", False): - return cls(verbosity=ctx.get("verbosity", 0)) + return cls( + verbosity=ctx.get("verbosity", 0), + show_logs=ctx.get("show_logs"), + show_output=ctx.get("show_output", False), + ) return None + def _print_eval_detail(self, result: TestResult) -> None: + """Print eval inputs/output/expected (enabled by --show-output or on failure).""" + p = result.eval_payload + if not p: + return + if p.inputs is not None: + print(f" | inputs: {str(p.inputs)[:200]}") + if p.output is not None: + print(f" | output: {str(p.output)[:200]}") + if p.expected_output is not None: + print(f" | expected: {str(p.expected_output)[:200]}") + + def _maybe_show_logs(self, result: TestResult) -> None: + """Show captured log records if --show-logs is active.""" + if not self._show_logs or not result.log_records: + return + min_level = getattr(logging, self._show_logs.upper(), logging.INFO) + for record in result.log_records: + if record.levelno >= min_level: + print( + f" LOG [{record.levelname}] {record.name}: {record.getMessage()}" + ) + def on_collection_finish(self, items: list[TestItem]) -> list[TestItem]: self._is_parallel = len(items) > 1 return items @@ -193,7 +219,11 @@ def on_test_pass(self, result: TestResult) -> None: retry_suffix = "" if result.max_attempts > 1: retry_suffix = f" [attempt {result.attempt}/{result.max_attempts}]" - print(f" OK {name} ({duration}){retry_suffix}") + scores_str = _format_eval_scores_inline(result) if result.is_eval else "" + print(f" OK {name} ({duration}){scores_str}{retry_suffix}") + if self._show_output and result.is_eval: + self._print_eval_detail(result) + 
self._maybe_show_logs(result) def on_test_fail(self, result: TestResult) -> None: name = _format_test_name(result, include_suite=self._is_parallel) @@ -216,6 +246,9 @@ def on_test_fail(self, result: TestResult) -> None: if result.output: for line in result.output.rstrip().splitlines(): print(f" | {line}") + if result.is_eval: + self._print_eval_detail(result) + self._maybe_show_logs(result) def on_test_skip(self, result: TestResult) -> None: if self._verbosity >= Verbosity.NORMAL: @@ -257,14 +290,16 @@ def _format_traceback(self, error: Exception) -> str: return "".join(lines) def _print_failure_summary(self) -> None: - if self._failed_results: + non_eval_failures = [r for r in self._failed_results if not r.is_eval] + if non_eval_failures: print("\n=== FAILURES ===") - for result in self._failed_results: + for result in non_eval_failures: self._print_failure_detail(result, is_error=False) - if self._error_results: + non_eval_errors = [r for r in self._error_results if not r.is_eval] + if non_eval_errors: print("\n=== ERRORS ===") - for result in self._error_results: + for result in non_eval_errors: self._print_failure_detail(result, is_error=True) def _print_failure_detail(self, result: TestResult, *, is_error: bool) -> None: @@ -324,7 +359,9 @@ def on_eval_suite_end(self, report: Any) -> None: print() def on_session_complete(self, result: SessionResult) -> None: - if self._failed_results or self._error_results: + has_non_eval_failures = any(not r.is_eval for r in self._failed_results) + has_non_eval_errors = any(not r.is_eval for r in self._error_results) + if has_non_eval_failures or has_non_eval_errors: self._print_failure_summary() total = ( diff --git a/protest/reporting/format.py b/protest/reporting/format.py new file mode 100644 index 0000000..6e23151 --- /dev/null +++ b/protest/reporting/format.py @@ -0,0 +1,39 @@ +"""Shared formatting helpers used by both Rich and Ascii reporters. 
+ +Only formats that are *truly identical* between the two reporters live here. +Visual rendering (glyphs vs ASCII words, colors) stays in each reporter. +""" + +from __future__ import annotations + +MIN_DURATION_THRESHOLD = 0.001 +_TOKEN_K_THRESHOLD = 1000 + + +def format_duration(seconds: float) -> str: + if seconds < MIN_DURATION_THRESHOLD: + return "<1ms" + if seconds < 1: + return f"{seconds * 1000:.0f}ms" + return f"{seconds:.2f}s" + + +def format_tokens(tokens: int) -> str: + """Format token count: 1234 → '1.2k', 45 → '45'.""" + return ( + f"{tokens / _TOKEN_K_THRESHOLD:.1f}k" + if tokens >= _TOKEN_K_THRESHOLD + else str(tokens) + ) + + +def format_usage(input_tokens: int, output_tokens: int, cost: float) -> str: + """Format usage stats as 'Xk in / Yk out, $0.0042'.""" + parts: list[str] = [] + if input_tokens > 0 or output_tokens > 0: + parts.append( + f"{format_tokens(input_tokens)} in / {format_tokens(output_tokens)} out" + ) + if cost > 0: + parts.append(f"${cost:.4f}") + return ", ".join(parts) diff --git a/protest/reporting/rich_reporter.py b/protest/reporting/rich_reporter.py index 506641d..5794457 100644 --- a/protest/reporting/rich_reporter.py +++ b/protest/reporting/rich_reporter.py @@ -26,6 +26,12 @@ ) from protest.evals.types import EvalSuiteReport from protest.plugin import PluginBase, PluginContext +from protest.reporting.format import ( + format_duration as _format_duration, +) +from protest.reporting.format import ( + format_usage as _format_usage, +) from protest.reporting.verbosity import Verbosity @@ -42,41 +48,6 @@ def _format_test_name(result: TestResult) -> str: return label.replace("[", "\\[") -MIN_DURATION_THRESHOLD = 0.001 - - -def _format_duration(seconds: float) -> str: - if seconds < MIN_DURATION_THRESHOLD: - return "<1ms" - if seconds < 1: - return f"{seconds * 1000:.0f}ms" - return f"{seconds:.2f}s" - - -_TOKEN_K_THRESHOLD = 1000 - - -def _format_tokens(tokens: int) -> str: - """Format token count: 1234 → '1.2k', 45 → '45'.""" - 
return ( - f"{tokens / _TOKEN_K_THRESHOLD:.1f}k" - if tokens >= _TOKEN_K_THRESHOLD - else str(tokens) - ) - - -def _format_usage(input_tokens: int, output_tokens: int, cost: float) -> str: - """Format usage stats as 'Xk in / Yk out, $0.0042'.""" - parts: list[str] = [] - if input_tokens > 0 or output_tokens > 0: - parts.append( - f"{_format_tokens(input_tokens)} in / {_format_tokens(output_tokens)} out" - ) - if cost > 0: - parts.append(f"${cost:.4f}") - return ", ".join(parts) - - def _format_eval_scores_inline(result: TestResult) -> str: """Format eval scores for inline display (e.g. ' bg_score=0.8 char_id=1.0').""" if not result.eval_payload: @@ -131,21 +102,6 @@ def add_cli_options(cls, parser: ArgumentParser) -> None: action="store_true", help="Disable colors (plain ASCII output)", ) - group.add_argument( - "--show-logs", - dest="show_logs", - nargs="?", - const="INFO", - default=None, - metavar="LEVEL", - help="Show captured log records (default: INFO+)", - ) - group.add_argument( - "--show-output", - dest="show_output", - action="store_true", - help="Show eval inputs/output/expected per case", - ) @classmethod def activate(cls, ctx: PluginContext) -> Self | None: diff --git a/tests/reporting/test_reporter_symmetry.py b/tests/reporting/test_reporter_symmetry.py new file mode 100644 index 0000000..4eec50c --- /dev/null +++ b/tests/reporting/test_reporter_symmetry.py @@ -0,0 +1,369 @@ +"""Symmetry tests between RichReporter and AsciiReporter. + +Goal: catch divergences between the two reporters before they ship as silent +asymmetries. A user who swaps `--no-color` should get the same *semantic* +output (same fields, same filters) — only the visual style differs. + +Three axes are enforced: + +1. Structural — both reporters expose the same public hooks (`on_*` handlers). +2. CLI — both reporters react to the same shared flags (`--show-output`, + `--show-logs`). Reporter-specific flags (`--no-color`) are allowed. +3. 
Behavioral — parametrized scenarios drive the same input through both + reporters and assert the same *semantic* markers appear + (score names for eval pass, eval detail on fail, summary omits + eval failures, etc.). +""" + +from __future__ import annotations + +import argparse +import inspect +import logging +from typing import Any + +import pytest + +from protest.entities import ( + EvalPayload, + EvalScoreEntry, + SessionResult, + TestResult, +) +from protest.plugin import PluginBase, PluginContext +from protest.reporting.ascii import AsciiReporter +from protest.reporting.rich_reporter import RichReporter + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +REPORTER_CLASSES = [RichReporter, AsciiReporter] + +# CLI flags that are handled by the shared run-parser (not by either reporter's +# add_cli_options). Both reporters must still read them via their activate(). +_SHARED_CLI_FLAGS = {"show_output", "show_logs"} + + +def _public_handlers(cls: type[PluginBase]) -> set[str]: + """Return the set of `on_*` handlers defined or overridden on cls. + + Only include methods that are *actually overridden* (not inherited from + PluginBase as no-ops). That's what makes the reporter visible to the bus. + """ + handlers: set[str] = set() + for name, member in inspect.getmembers(cls, predicate=inspect.isfunction): + if not name.startswith("on_"): + continue + # Skip no-op base implementations that a subclass didn't override. 
+ base_member = getattr(PluginBase, name, None) + if base_member is not None and member is base_member: + continue + handlers.add(name) + return handlers + + +def _cli_flag_dests(cls: type[PluginBase]) -> set[str]: + """Return the argparse `dest` names registered by cls.add_cli_options.""" + parser = argparse.ArgumentParser() + cls.add_cli_options(parser) + dests: set[str] = set() + for action in parser._actions: + if action.dest and action.dest != "help": + dests.add(action.dest) + return dests + + +def _make_reporter(cls: type[PluginBase], **kwargs: Any) -> PluginBase: + """Activate a reporter via its own activate() path to exercise wiring.""" + ctx_args = {"no_color": cls is AsciiReporter, "verbosity": 1, **kwargs} + ctx = PluginContext(args=ctx_args) + instance = cls.activate(ctx) + assert instance is not None, f"{cls.__name__}.activate returned None" + return instance + + +def _capture_output(capsys: pytest.CaptureFixture[str]) -> str: + """Capture everything captured so far on stdout+stderr. + + Rich writes via `self.console` (stdout by default), Ascii uses `print`. + Capsys grabs both uniformly. 
+ """ + captured = capsys.readouterr() + return captured.out + captured.err + + +@pytest.fixture +def eval_result_pass() -> TestResult: + """A passing eval TestResult with two scores (one bool, one float).""" + return TestResult( + name="case_alpha", + node_id="mod::chatbot::case_alpha", + duration=0.05, + is_eval=True, + eval_payload=EvalPayload( + case_name="case_alpha", + passed=True, + task_duration=0.05, + inputs="hello", + output="world", + expected_output="world", + scores={ + "contains_world": EvalScoreEntry(value=True, passed=True), + "similarity": EvalScoreEntry(value=0.92, passed=True), + }, + ), + ) + + +@pytest.fixture +def eval_result_fail() -> TestResult: + """A failing eval TestResult (one score fails).""" + return TestResult( + name="case_beta", + node_id="mod::chatbot::case_beta", + duration=0.04, + error=AssertionError("score contains_hi failed"), + is_eval=True, + eval_payload=EvalPayload( + case_name="case_beta", + passed=False, + task_duration=0.04, + inputs="goodbye", + output="farewell", + expected_output="hi", + scores={ + "contains_hi": EvalScoreEntry(value=False, passed=False), + }, + ), + ) + + +@pytest.fixture +def plain_failing_test() -> TestResult: + return TestResult( + name="test_plain_fail", + node_id="mod::test_plain_fail", + duration=0.01, + error=AssertionError("plain failure"), + ) + + +# --------------------------------------------------------------------------- +# 1. Structural symmetry +# --------------------------------------------------------------------------- + + +class TestStructuralSymmetry: + """Ensure the two reporters expose the same public handler surface.""" + + def test_reporters_override_same_handlers(self) -> None: + """Both reporters override the same set of on_* methods. + + If one reporter starts overriding a hook that the other ignores, an + event will be invisible in the "other" reporter — that's the bug we + want to catch at test time, not in production. 
+ """ + rich_handlers = _public_handlers(RichReporter) + ascii_handlers = _public_handlers(AsciiReporter) + + only_in_rich = rich_handlers - ascii_handlers + only_in_ascii = ascii_handlers - rich_handlers + assert not only_in_rich, ( + f"Rich handles events that Ascii doesn't: {sorted(only_in_rich)}" + ) + assert not only_in_ascii, ( + f"Ascii handles events that Rich doesn't: {sorted(only_in_ascii)}" + ) + + +# --------------------------------------------------------------------------- +# 2. CLI symmetry +# --------------------------------------------------------------------------- + + +class TestCliSymmetry: + """Ensure the two reporters consume the same shared flags. + + Reporter-specific flags are allowed (e.g. `--no-color` makes sense only on + the Ascii side) — they're expected to appear in either one but not both. + The rule is: anything in `_SHARED_CLI_FLAGS` must be *activatable* on both + reporters (read from PluginContext via activate()). + """ + + @pytest.mark.parametrize( + "flag,value,attr", + [ + pytest.param("show_output", True, "_show_output", id="show_output"), + pytest.param("show_logs", "INFO", "_show_logs", id="show_logs"), + ], + ) + def test_shared_flags_reach_both_reporters( + self, flag: str, value: Any, attr: str + ) -> None: + """Given a shared run-parser flag, both reporters pick it up via activate().""" + for cls in REPORTER_CLASSES: + reporter = _make_reporter(cls, **{flag: value}) + assert getattr(reporter, attr) == value, ( + f"{cls.__name__} didn't wire flag '{flag}' into attr '{attr}'" + ) + + def test_reporters_dont_redeclare_shared_flags(self) -> None: + """Shared flags live on the run-parser, not on reporter add_cli_options. + + If either reporter redeclares them via add_cli_options, argparse will + raise at runtime when both get wired (cli/main.py iterates plugin + classes and calls add_cli_options on each). 
+ """ + for cls in REPORTER_CLASSES: + dests = _cli_flag_dests(cls) + redeclared = dests & _SHARED_CLI_FLAGS + assert not redeclared, ( + f"{cls.__name__}.add_cli_options redeclares shared flags: " + f"{sorted(redeclared)} — move them to cli._create_run_parser" + ) + + +# --------------------------------------------------------------------------- +# 3. Behavioral symmetry +# --------------------------------------------------------------------------- + + +class TestBehavioralSymmetry: + """Drive the same events through both reporters; assert same semantics. + + We deliberately avoid asserting on *exact* characters: the visual prefix + differs (`✓` vs `OK`, colors vs plain). What must be identical is which + pieces of information are rendered. + """ + + @pytest.mark.parametrize("reporter_cls", REPORTER_CLASSES) + def test_eval_pass_shows_score_names_inline( + self, + reporter_cls: type[PluginBase], + eval_result_pass: TestResult, + capsys: pytest.CaptureFixture[str], + ) -> None: + """Given a passing eval, both reporters surface each score's name inline.""" + reporter = _make_reporter(reporter_cls, verbosity=1) + reporter.on_test_pass(eval_result_pass) + output = _capture_output(capsys) + assert "contains_world" in output, ( + f"{reporter_cls.__name__}: missing score name" + ) + assert "similarity" in output, f"{reporter_cls.__name__}: missing float score" + + @pytest.mark.parametrize("reporter_cls", REPORTER_CLASSES) + def test_eval_fail_shows_detail_inline( + self, + reporter_cls: type[PluginBase], + eval_result_fail: TestResult, + capsys: pytest.CaptureFixture[str], + ) -> None: + """Given a failing eval, both reporters dump inputs/output/expected. + + This must happen regardless of --show-output — the user can't debug + a failed assertion without seeing what the task actually produced. 
+ """ + reporter = _make_reporter(reporter_cls) + reporter.on_test_fail(eval_result_fail) + output = _capture_output(capsys) + assert "goodbye" in output, f"{reporter_cls.__name__}: missing inputs" + assert "farewell" in output, f"{reporter_cls.__name__}: missing output" + assert "hi" in output, f"{reporter_cls.__name__}: missing expected" + + @pytest.mark.parametrize("reporter_cls", REPORTER_CLASSES) + def test_show_output_true_prints_eval_detail_on_pass( + self, + reporter_cls: type[PluginBase], + eval_result_pass: TestResult, + capsys: pytest.CaptureFixture[str], + ) -> None: + """Given --show-output, both reporters print eval detail even on pass.""" + reporter = _make_reporter(reporter_cls, verbosity=1, show_output=True) + reporter.on_test_pass(eval_result_pass) + output = _capture_output(capsys) + assert "hello" in output, f"{reporter_cls.__name__}: missing inputs on pass" + assert "world" in output, f"{reporter_cls.__name__}: missing output on pass" + + @pytest.mark.parametrize("reporter_cls", REPORTER_CLASSES) + def test_show_output_false_omits_eval_detail_on_pass( + self, + reporter_cls: type[PluginBase], + eval_result_pass: TestResult, + capsys: pytest.CaptureFixture[str], + ) -> None: + """Given default --show-output, eval detail is hidden on pass.""" + reporter = _make_reporter(reporter_cls, verbosity=1, show_output=False) + reporter.on_test_pass(eval_result_pass) + output = _capture_output(capsys) + # "hello" and "world" appear in the score name ("contains_world"); + # assert on a unique substring that only appears if the detail block runs. 
+ assert "inputs:" not in output, ( + f"{reporter_cls.__name__}: leaked eval detail without --show-output" + ) + + @pytest.mark.parametrize("reporter_cls", REPORTER_CLASSES) + def test_failure_summary_omits_eval_failures( + self, + reporter_cls: type[PluginBase], + eval_result_fail: TestResult, + plain_failing_test: TestResult, + capsys: pytest.CaptureFixture[str], + ) -> None: + """End-of-session summary must not re-list eval failures. + + Eval cases are already displayed inline via on_test_fail. Re-listing + them in the summary duplicates noise — the pattern agreed on is + "non_eval_failures only". + """ + reporter = _make_reporter(reporter_cls) + reporter.on_test_fail(eval_result_fail) + reporter.on_test_fail(plain_failing_test) + capsys.readouterr() # drop inline fail output + + reporter.on_session_complete( + SessionResult(passed=0, failed=2, errors=0, duration=1.0) + ) + summary = _capture_output(capsys) + + assert "test_plain_fail" in summary, ( + f"{reporter_cls.__name__}: summary lost the plain failure" + ) + # The eval case name should NOT appear in the failure-summary block. + # It may appear in the inline tally above; we only captured summary here. 
+ assert "case_beta" not in summary, ( + f"{reporter_cls.__name__}: summary re-lists eval failure (should be inline only)" + ) + + @pytest.mark.parametrize("reporter_cls", REPORTER_CLASSES) + def test_show_logs_prints_captured_records( + self, + reporter_cls: type[PluginBase], + capsys: pytest.CaptureFixture[str], + ) -> None: + """Given --show-logs INFO, both reporters emit the captured log records.""" + record = logging.LogRecord( + name="mylib.module", + level=logging.INFO, + pathname="x.py", + lineno=1, + msg="captured thing", + args=(), + exc_info=None, + ) + result = TestResult( + name="test_foo", + node_id="mod::test_foo", + duration=0.01, + log_records=(record,), + ) + reporter = _make_reporter(reporter_cls, verbosity=1, show_logs="INFO") + reporter.on_test_pass(result) + output = _capture_output(capsys) + assert "captured thing" in output, ( + f"{reporter_cls.__name__}: --show-logs didn't render the record" + ) + assert "mylib.module" in output, ( + f"{reporter_cls.__name__}: --show-logs didn't render the logger name" + ) diff --git a/tests/reporting/test_rich_reporter.py b/tests/reporting/test_rich_reporter.py index 1452579..4585338 100644 --- a/tests/reporting/test_rich_reporter.py +++ b/tests/reporting/test_rich_reporter.py @@ -15,11 +15,13 @@ TestRetryInfo, ) from protest.events.types import Event -from protest.reporting.rich_reporter import ( +from protest.reporting.format import ( MIN_DURATION_THRESHOLD, - RichReporter, - _format_duration, ) +from protest.reporting.format import ( + format_duration as _format_duration, +) +from protest.reporting.rich_reporter import RichReporter class TestFormatDuration: From ef0c1765fb80a9777531f7ba657f69549329ffaa Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Fri, 24 Apr 2026 07:53:38 +0200 Subject: [PATCH 29/60] refactor(reporting, examples, core): add `_safe_repr` for JSON-safe string handling and extend eval support - Introduced `_safe_repr` utility to safely 
truncate and render arbitrary objects for JSON serialization in web reporting. - Updated Yorkshire example to showcase `EvalSuite` API for cleaner and type-safe eval case definitions. - Added `KindFilterPlugin` for improved filtering capabilities in core session logic. - Enhanced eval case serialization to exclude skipped scores, improving `history --compare` accuracy. --- examples/yorkshire/session.py | 26 ++++++++++++++++---------- protest/api.py | 2 ++ protest/history/plugin.py | 23 +++++++++++++++++++---- protest/reporting/web.py | 22 +++++++++++++++++++--- 4 files changed, 56 insertions(+), 17 deletions(-) diff --git a/examples/yorkshire/session.py b/examples/yorkshire/session.py index 7b8c3c3..b723cb9 100644 --- a/examples/yorkshire/session.py +++ b/examples/yorkshire/session.py @@ -5,14 +5,16 @@ Run only tests: protest run examples.yorkshire.session:session - (protest run filters to kind=test by default) + (protest run filters to kind=test) Run only evals: protest eval examples.yorkshire.session:session """ +from typing import Annotated + from examples.yorkshire.app.chatbot import yorkshire_chatbot -from examples.yorkshire.evals.dataset import dataset +from examples.yorkshire.evals.dataset import suite_evaluators, yorkshire_cases from examples.yorkshire.tests.fixtures import ( configure_kennel_logging, kennel, @@ -26,8 +28,9 @@ from examples.yorkshire.tests.suites.rate_limited import rate_limited_suite from examples.yorkshire.tests.suites.seniors.suite import seniors_suite from examples.yorkshire.tests.suites.showcase.suite import showcase_suite -from protest import ProTestSession -from protest.evals import ModelInfo +from protest import From, ProTestSession +from protest.evals import EvalCase, ModelInfo +from protest.evals.suite import EvalSuite session = ProTestSession(concurrency=4, history=True) session.use(BarkPlugin) @@ -35,7 +38,6 @@ session.bind(kennel) session.bind(yorkshire) -# Tests session.add_suite(puppies_suite) session.add_suite(adults_suite) 
session.add_suite(seniors_suite) @@ -44,9 +46,13 @@ session.add_suite(rate_limited_suite) session.add_suite(custom_factory_suite) -# Evals -session.configure_evals(model=ModelInfo(name="yorkshire-chatbot-v1", provider="local")) -session.register_dataset( - dataset, - task=yorkshire_chatbot, +yorkshire_suite = EvalSuite( + "yorkshire_eval", + model=ModelInfo(name="yorkshire-chatbot-v1", provider="local"), ) +session.add_suite(yorkshire_suite) + + +@yorkshire_suite.eval(evaluators=suite_evaluators) +def yorkshire_eval(case: Annotated[EvalCase, From(yorkshire_cases)]) -> str: + return yorkshire_chatbot(case.inputs) diff --git a/protest/api.py b/protest/api.py index ce8c178..7b1e169 100644 --- a/protest/api.py +++ b/protest/api.py @@ -28,6 +28,7 @@ def test_example(): ) from protest.events.types import Event from protest.filters.keyword import KeywordFilterPlugin +from protest.filters.kind import KindFilterPlugin from protest.filters.suite import SuiteFilterPlugin from protest.plugin import PluginBase, PluginContext from protest.tags.plugin import TagFilterPlugin @@ -150,6 +151,7 @@ def collect_tests( # noqa: PLR0913 - public API with many optional params TagFilterPlugin, SuiteFilterPlugin, KeywordFilterPlugin, + KindFilterPlugin, ] for plugin_class in filter_plugins: instance = plugin_class.activate(ctx) diff --git a/protest/history/plugin.py b/protest/history/plugin.py index eac653c..92b7942 100644 --- a/protest/history/plugin.py +++ b/protest/history/plugin.py @@ -206,19 +206,34 @@ def on_session_end(self, result: Any) -> None: def _serialize_eval_case(case: EvalCaseResult) -> dict[str, Any]: - """Serialize an eval case result for JSONL storage.""" + """Serialize an eval case result for JSONL storage. + + Skipped scores are excluded: a ShortCircuit skip produces + `EvalScore(value=False, skipped=True)` — serializing it as an assertion + would look like a real failure in `history --compare` diffs. 
+ """ entry: dict[str, Any] = { "passed": case.passed, "is_error": case.is_error, "duration": round(case.duration, 3), - "scores": {s.name: s.value for s in case.scores if s.is_metric}, + "scores": { + s.name: s.value for s in case.scores if s.is_metric and not s.skipped + }, "case_hash": case.case_hash, "eval_hash": case.eval_hash, } - labels = {s.name: s.value for s in case.scores if isinstance(s.value, str)} + labels = { + s.name: s.value + for s in case.scores + if isinstance(s.value, str) and not s.skipped + } if labels: entry["labels"] = labels - assertions = {s.name: s.value for s in case.scores if isinstance(s.value, bool)} + assertions = { + s.name: s.value + for s in case.scores + if isinstance(s.value, bool) and not s.skipped + } if assertions: entry["assertions"] = assertions return entry diff --git a/protest/reporting/web.py b/protest/reporting/web.py index 5eb7119..0b6f915 100644 --- a/protest/reporting/web.py +++ b/protest/reporting/web.py @@ -51,6 +51,22 @@ _broadcast_clients: set[Any] = set() +_REPR_LIMIT = 2048 + + +def _safe_repr(value: Any) -> str | None: + """Render an arbitrary value as a JSON-safe string, capped at _REPR_LIMIT.""" + if value is None: + return None + try: + text = str(value) + except Exception as exc: + text = f"" + if len(text) > _REPR_LIMIT: + text = text[:_REPR_LIMIT] + f"... 
" + return text + + def _format_traceback(error: Exception) -> str: lines = traceback.format_exception(type(error), error, error.__traceback__) return "".join(lines) @@ -309,9 +325,9 @@ def _result_payload( payload["evalPayload"] = { "caseName": ep.case_name, "passed": ep.passed, - "inputs": ep.inputs, - "output": ep.output, - "expected": ep.expected_output, + "inputs": _safe_repr(ep.inputs), + "output": _safe_repr(ep.output), + "expected": _safe_repr(ep.expected_output), "scores": { name: { "value": entry.value, From 67c4887c22c0394395ce6642f0c4646db0ae726a Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Fri, 24 Apr 2026 19:28:08 +0200 Subject: [PATCH 30/60] tests: add coverage for `EvalCase` invariants, `history --compare` logic, and CLI argument parsing mutex validations --- docs/evals.md | 6 +- protest/cli/history.py | 22 ++++-- protest/evals/evaluator.py | 19 +++-- protest/evals/wrapper.py | 2 +- tests/evals/test_evalcase.py | 36 ++++++++++ tests/test_history_changes.py | 128 ++++++++++++++++++++++++++++++++++ tests/test_history_cli.py | 112 +++++++++++++++++++++++++++++ 7 files changed, 312 insertions(+), 13 deletions(-) create mode 100644 tests/evals/test_evalcase.py create mode 100644 tests/test_history_changes.py create mode 100644 tests/test_history_cli.py diff --git a/docs/evals.md b/docs/evals.md index 96e51d7..562bcde 100644 --- a/docs/evals.md +++ b/docs/evals.md @@ -193,8 +193,8 @@ The judge handles structured output — no text parsing needed. 
See [Judge](#jud Different thresholds per case = different evaluator bindings: ```python -EvalCase(inputs="easy lookup", evaluators=[keyword_check(keywords=["paris"], min_recall=0.9)]), -EvalCase(inputs="hard causal", evaluators=[keyword_check(keywords=["paris"], min_recall=0.3)]), +EvalCase(name="easy_lookup", inputs="easy lookup", evaluators=[keyword_check(keywords=["paris"], min_recall=0.9)]), +EvalCase(name="hard_causal", inputs="hard causal", evaluators=[keyword_check(keywords=["paris"], min_recall=0.3)]), ``` ### ShortCircuit @@ -225,7 +225,7 @@ evaluators=[not_empty] evaluators=[keyword_check(keywords=["python", "async"], min_recall=0.75)] # Per-case evaluators (added to suite-level) -EvalCase(inputs="...", evaluators=[llm_judge(rubric="Check factual accuracy")]) +EvalCase(name="factual_accuracy_case", inputs="...", evaluators=[llm_judge(rubric="Check factual accuracy")]) ``` ### EvalContext diff --git a/protest/cli/history.py b/protest/cli/history.py index e83216d..01198d8 100644 --- a/protest/cli/history.py +++ b/protest/cli/history.py @@ -20,8 +20,12 @@ def handle_history_command(argv: list[str]) -> None: ) parser.add_argument("--model", type=str, default=None, help="Filter by model name") parser.add_argument("--suite", type=str, default=None, help="Filter by suite name") - parser.add_argument("--runs", action="store_true", help="Show run-by-run list") - parser.add_argument( + + action_group = parser.add_mutually_exclusive_group() + action_group.add_argument( + "--runs", action="store_true", help="Show run-by-run list" + ) + action_group.add_argument( "--show", nargs="?", const=0, @@ -30,11 +34,13 @@ def handle_history_command(argv: list[str]) -> None: metavar="N", help="Detailed panel for Nth most recent run (0=latest)", ) - parser.add_argument( + action_group.add_argument( "--compare", action="store_true", help="Compare 2 most recent runs" ) - parser.add_argument("--evals", action="store_true", help="Eval runs only") - parser.add_argument("--tests", 
action="store_true", help="Test runs only") + + kind_group = parser.add_mutually_exclusive_group() + kind_group.add_argument("--evals", action="store_true", help="Eval runs only") + kind_group.add_argument("--tests", action="store_true", help="Test runs only") parser.add_argument( "--clean-dirty", action="store_true", @@ -313,6 +319,7 @@ def compare(self, current: dict[str, Any], previous: dict[str, Any]) -> None: ("regressed", "Regressions", "red", "-"), ("modified", "Modified", "yellow", "⟳"), ("new", "New", "cyan", "*"), + ("deleted", "Deleted", "red", "✗"), ] has_any = False for key, label, color, marker in labels: @@ -485,6 +492,7 @@ def _classify_changes( "regressed": [], "modified": [], "new": [], + "deleted": [], } for name, curr in curr_cases.items(): prev = prev_cases.get(name) @@ -498,6 +506,9 @@ def _classify_changes( result["fixed"].append(name) elif not curr.get("passed") and prev.get("passed"): result["regressed"].append(name) + for name in prev_cases: + if name not in curr_cases: + result["deleted"].append(name) return result @@ -507,6 +518,7 @@ def _print_changes(changes: dict[str, list[str]]) -> None: "regressed": ("Regressions", "-"), "modified": ("Modified", "⟳"), "new": ("New", "*"), + "deleted": ("Deleted", "✗"), } has_any = False for key, (label, marker) in labels.items(): diff --git a/protest/evals/evaluator.py b/protest/evals/evaluator.py index 569ce83..242bb64 100644 --- a/protest/evals/evaluator.py +++ b/protest/evals/evaluator.py @@ -126,11 +126,15 @@ def judge_cost(self) -> float: class EvalCase: """Typed container for eval case data in ForEach. + `name` is required: it identifies the case across history, reporting, and + file-based output. Two cases sharing a name collide silently in those + downstream consumers. 
+ Usage:: cases = ForEach([ - EvalCase(inputs="Who is Marie?", expected="Marie, Resistance", name="lookup"), - EvalCase(inputs="Who is Pierre?", expected="Pierre, arrest"), + EvalCase(inputs="Who is Marie?", name="marie_lookup", expected="Marie, Resistance"), + EvalCase(inputs="Who is Pierre?", name="pierre_lookup", expected="Pierre, arrest"), ]) @suite.eval(evaluators=[contains_facts]) @@ -139,13 +143,20 @@ def my_eval(case: Annotated[EvalCase, From(cases)]) -> str: """ inputs: Any + name: str expected: Any = None - name: str = "" evaluators: list[Any] = field(default_factory=list) metadata: dict[str, Any] = field(default_factory=dict) + def __post_init__(self) -> None: + if not self.name: + raise ValueError( + "EvalCase.name must be a non-empty string " + "(used for history tracking and case identity)." + ) + def __repr__(self) -> str: - return self.name or f"EvalCase({self.inputs!r})" + return self.name class ShortCircuit: diff --git a/protest/evals/wrapper.py b/protest/evals/wrapper.py index d4278c0..82b25a8 100644 --- a/protest/evals/wrapper.py +++ b/protest/evals/wrapper.py @@ -124,7 +124,7 @@ def _extract_expected(kwargs: dict[str, Any]) -> Any: def _extract_case_name(kwargs: dict[str, Any], fallback: str) -> str: case = _find_case(kwargs) - if case is None or not case.name: + if case is None: return fallback return case.name diff --git a/tests/evals/test_evalcase.py b/tests/evals/test_evalcase.py new file mode 100644 index 0000000..12435f6 --- /dev/null +++ b/tests/evals/test_evalcase.py @@ -0,0 +1,36 @@ +"""Tests for `EvalCase` construction invariants.""" + +from __future__ import annotations + +import pytest + +from protest.evals import EvalCase + + +class TestEvalCaseRequiresName: + """`name` is required and must be non-empty.""" + + def test_name_required_as_kwarg(self) -> None: + case = EvalCase(inputs="x", name="my_case") + assert case.name == "my_case" + + def test_missing_name_raises(self) -> None: + with pytest.raises(TypeError): + 
EvalCase(inputs="x") # type: ignore[call-arg] + + def test_empty_name_raises(self) -> None: + with pytest.raises(ValueError, match="non-empty"): + EvalCase(inputs="x", name="") + + def test_name_is_second_positional(self) -> None: + case = EvalCase("input_val", "case_name") + assert case.inputs == "input_val" + assert case.name == "case_name" + + +class TestEvalCaseRepr: + """`__repr__` returns the name (no fallback anymore since name is required).""" + + def test_repr_is_name(self) -> None: + case = EvalCase(inputs="x", name="readable_name") + assert repr(case) == "readable_name" diff --git a/tests/test_history_changes.py b/tests/test_history_changes.py new file mode 100644 index 0000000..a24698e --- /dev/null +++ b/tests/test_history_changes.py @@ -0,0 +1,128 @@ +"""Tests for `_classify_changes` — diffing logic for `protest history --compare`. + +Each case entry is a minimal dict mirroring what `_all_cases(entry)` returns +from a history JSONL record: at least `passed`, optionally `case_hash` and +`eval_hash`. 
+""" + +from __future__ import annotations + +from protest.cli.history import _classify_changes + + +def _case( + *, + passed: bool = True, + case_hash: str | None = None, + eval_hash: str | None = None, +) -> dict: + entry: dict = {"passed": passed} + if case_hash is not None: + entry["case_hash"] = case_hash + if eval_hash is not None: + entry["eval_hash"] = eval_hash + return entry + + +class TestClassifyChangesDeleted: + """Cases present in `prev` but absent from `curr` land in `deleted`.""" + + def test_single_deletion(self) -> None: + prev = {"case_a": _case(passed=True), "case_b": _case(passed=True)} + curr = {"case_a": _case(passed=True)} + changes = _classify_changes(curr, prev) + assert changes["deleted"] == ["case_b"] + assert changes["new"] == [] + assert changes["fixed"] == [] + assert changes["regressed"] == [] + assert changes["modified"] == [] + + def test_multiple_deletions_preserve_prev_order(self) -> None: + prev = { + "alpha": _case(), + "beta": _case(), + "gamma": _case(), + "delta": _case(), + } + curr = {"alpha": _case()} + changes = _classify_changes(curr, prev) + assert changes["deleted"] == ["beta", "gamma", "delta"] + + def test_deletion_coexists_with_other_changes(self) -> None: + prev = { + "to_delete": _case(passed=True), + "to_fix": _case(passed=False), + "stable": _case(passed=True), + } + curr = { + "to_fix": _case(passed=True), + "stable": _case(passed=True), + "brand_new": _case(passed=True), + } + changes = _classify_changes(curr, prev) + assert changes["deleted"] == ["to_delete"] + assert changes["fixed"] == ["to_fix"] + assert changes["new"] == ["brand_new"] + + def test_all_cases_deleted(self) -> None: + prev = {"a": _case(), "b": _case()} + curr: dict = {} + changes = _classify_changes(curr, prev) + assert changes["deleted"] == ["a", "b"] + assert changes["new"] == [] + + def test_no_deletions(self) -> None: + prev = {"a": _case()} + curr = {"a": _case(), "b": _case()} + changes = _classify_changes(curr, prev) + assert 
changes["deleted"] == [] + assert changes["new"] == ["b"] + + +class TestClassifyChangesExistingCategories: + """Existing categories keep working after adding `deleted`.""" + + def test_new_case(self) -> None: + changes = _classify_changes({"a": _case()}, {}) + assert changes["new"] == ["a"] + + def test_fixed_case(self) -> None: + prev = {"a": _case(passed=False)} + curr = {"a": _case(passed=True)} + assert _classify_changes(curr, prev)["fixed"] == ["a"] + + def test_regressed_case(self) -> None: + prev = {"a": _case(passed=True)} + curr = {"a": _case(passed=False)} + assert _classify_changes(curr, prev)["regressed"] == ["a"] + + def test_modified_case_hash(self) -> None: + prev = {"a": _case(case_hash="h1")} + curr = {"a": _case(case_hash="h2")} + assert _classify_changes(curr, prev)["modified"] == ["a (case modified)"] + + def test_modified_eval_hash(self) -> None: + prev = {"a": _case(eval_hash="h1")} + curr = {"a": _case(eval_hash="h2")} + assert _classify_changes(curr, prev)["modified"] == ["a (scoring modified)"] + + def test_no_changes(self) -> None: + prev = {"a": _case(passed=True)} + curr = {"a": _case(passed=True)} + changes = _classify_changes(curr, prev) + assert all(not v for v in changes.values()) + + +class TestClassifyChangesResultShape: + """Result dict always has the five expected keys.""" + + def test_empty_inputs_still_yield_five_buckets(self) -> None: + changes = _classify_changes({}, {}) + assert set(changes.keys()) == { + "fixed", + "regressed", + "modified", + "new", + "deleted", + } + assert all(v == [] for v in changes.values()) diff --git a/tests/test_history_cli.py b/tests/test_history_cli.py new file mode 100644 index 0000000..70e81b2 --- /dev/null +++ b/tests/test_history_cli.py @@ -0,0 +1,112 @@ +"""Tests for `protest history` CLI argument parsing. 
+ +Covers mutually-exclusive flag groups: +- Action: `--runs` / `--show` / `--compare` +- Kind: `--evals` / `--tests` + +`handle_history_command(argv)` triggers `SystemExit(2)` from argparse when a +mutex is violated. Tests assert both the exit code and the stderr message +mentioning the conflicting flag. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import pytest + +from protest.cli.history import handle_history_command + +if TYPE_CHECKING: + from pathlib import Path + + +class TestActionMutex: + """`--runs`, `--show`, `--compare` cannot be combined.""" + + @pytest.mark.parametrize( + ("argv", "expected_flag"), + [ + (["--runs", "--compare"], "--compare"), + (["--compare", "--runs"], "--runs"), + (["--runs", "--show", "0"], "--show"), + (["--show", "0", "--runs"], "--runs"), + (["--show", "1", "--compare"], "--compare"), + (["--compare", "--show", "1"], "--show"), + ], + ) + def test_mutex_violation_exits_with_error( + self, + argv: list[str], + expected_flag: str, + capsys: pytest.CaptureFixture[str], + ) -> None: + with pytest.raises(SystemExit) as exc_info: + handle_history_command(argv) + assert exc_info.value.code == 2 + stderr = capsys.readouterr().err + assert "not allowed with argument" in stderr + assert expected_flag in stderr + + +class TestKindMutex: + """`--evals` and `--tests` cannot be combined.""" + + @pytest.mark.parametrize( + "argv", + [ + ["--evals", "--tests"], + ["--tests", "--evals"], + ], + ) + def test_mutex_violation_exits_with_error( + self, + argv: list[str], + capsys: pytest.CaptureFixture[str], + ) -> None: + with pytest.raises(SystemExit) as exc_info: + handle_history_command(argv) + assert exc_info.value.code == 2 + stderr = capsys.readouterr().err + assert "not allowed with argument" in stderr + + +class TestMutexIndependence: + """Flags from different groups can be combined freely.""" + + @pytest.mark.parametrize( + "action_flags", + [ + ["--runs"], + ["--compare"], + ["--show", "0"], + ], + ) + 
@pytest.mark.parametrize("kind_flag", ["--evals", "--tests"]) + def test_cross_group_combinations_parse_cleanly( + self, + action_flags: list[str], + kind_flag: str, + tmp_path: Path, + capsys: pytest.CaptureFixture[str], + ) -> None: + argv = [*action_flags, kind_flag, "--path", str(tmp_path)] + with pytest.raises(SystemExit) as exc_info: + handle_history_command(argv) + assert exc_info.value.code == 0 + captured = capsys.readouterr() + assert "not allowed with argument" not in captured.err + + +class TestHelpShowsMutex: + """`--help` output surfaces both mutex groups in usage line.""" + + def test_help_output_shows_action_and_kind_groups( + self, capsys: pytest.CaptureFixture[str] + ) -> None: + with pytest.raises(SystemExit) as exc_info: + handle_history_command(["--help"]) + assert exc_info.value.code == 0 + stdout = capsys.readouterr().out + assert "[--runs | --show [N] | --compare]" in stdout + assert "[--evals | --tests]" in stdout From 909ac7219eea73c4674b75b6ae5bb45024b326ad Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Fri, 24 Apr 2026 22:01:44 +0200 Subject: [PATCH 31/60] tests(evals): add tests for `EvalCaseResult.from_test_result` and refactor writer construction - Added comprehensive tests for `EvalCaseResult.from_test_result` to validate field mappings and defensive checks. - Refactored writer logic to use `EvalCaseResult.from_test_result`, simplifying redundant helper methods. 
--- protest/core/runner.py | 36 +------ protest/evals/results_writer.py | 32 +------ protest/evals/types.py | 43 ++++++++- tests/evals/test_eval_case_result.py | 135 +++++++++++++++++++++++++++ 4 files changed, 183 insertions(+), 63 deletions(-) create mode 100644 tests/evals/test_eval_case_result.py diff --git a/protest/core/runner.py b/protest/core/runner.py index f6bab5b..d25fb47 100644 --- a/protest/core/runner.py +++ b/protest/core/runner.py @@ -17,7 +17,7 @@ SessionSetupInfo, TestCounts, ) -from protest.evals.types import EvalCaseResult, EvalScore, EvalSuiteReport +from protest.evals.types import EvalCaseResult, EvalSuiteReport from protest.events.types import Event from protest.execution.capture import ( GlobalCapturePatch, @@ -76,7 +76,7 @@ def _collect_eval_result(self, result: TestResult) -> None: if not result.is_eval or result.eval_payload is None: return suite_name = result.suite_path.root_name if result.suite_path else "evals" - case_result = _build_eval_case_result(result) + case_result = EvalCaseResult.from_test_result(result) self._eval_results.setdefault(suite_name, []).append(case_result) async def _main_loop(self) -> bool: # noqa: PLR0915 @@ -204,35 +204,3 @@ async def _emit_eval_suite_end(self, suite_path: Any) -> None: duration=sum(c.duration for c in eval_cases), ) await self._session.events.emit(Event.EVAL_SUITE_END, report) - - -def _build_eval_case_result(result: TestResult) -> EvalCaseResult: - """Build EvalCaseResult from a TestResult with eval_payload.""" - payload = result.eval_payload - assert payload is not None - return EvalCaseResult( - case_name=payload.case_name or "", - node_id=result.node_id, - scores=tuple( - EvalScore( - name=name, - value=entry.value, - ) - for name, entry in payload.scores.items() - ), - duration=payload.task_duration, - passed=not (result.error is not None or not payload.passed), - inputs=payload.inputs, - output=payload.output, - expected_output=payload.expected_output, - case_hash=payload.case_hash, - 
eval_hash=payload.eval_hash, - task_input_tokens=payload.task_input_tokens, - task_output_tokens=payload.task_output_tokens, - task_cost=payload.task_cost, - judge_call_count=payload.judge_call_count, - judge_input_tokens=payload.judge_input_tokens, - judge_output_tokens=payload.judge_output_tokens, - judge_cost=payload.judge_cost, - is_error=result.is_fixture_error, - ) diff --git a/protest/evals/results_writer.py b/protest/evals/results_writer.py index 67ca569..db64f0e 100644 --- a/protest/evals/results_writer.py +++ b/protest/evals/results_writer.py @@ -38,16 +38,16 @@ def activate(cls, ctx: PluginContext) -> EvalResultsWriter: return cls(history_dir=ctx.get("history_dir")) def on_test_pass(self, result: TestResult) -> None: - self._maybe_write(result, passed=True) + self._maybe_write(result) def on_test_fail(self, result: TestResult) -> None: - self._maybe_write(result, passed=False) + self._maybe_write(result) - def _maybe_write(self, result: TestResult, *, passed: bool) -> None: + def _maybe_write(self, result: TestResult) -> None: if not result.is_eval or result.eval_payload is None: return suite_name = result.suite_path.root_name if result.suite_path else "evals" - case_result = _build_case_result(result, passed) + case_result = EvalCaseResult.from_test_result(result) self._write_case_file(case_result, suite_name) def _write_case_file(self, case_result: EvalCaseResult, suite_name: str) -> None: @@ -65,30 +65,6 @@ def on_eval_suite_end(self, report: Any) -> None: print(f" Results: {run_dir}") -def _build_case_result(result: TestResult, passed: bool) -> EvalCaseResult: - """Build EvalCaseResult from a TestResult with eval_payload.""" - payload = result.eval_payload - assert payload is not None - return EvalCaseResult( - case_name=payload.case_name or "", - node_id=result.node_id, - scores=tuple( - EvalScore( - name=name, - value=entry.value, - ) - for name, entry in payload.scores.items() - ), - duration=payload.task_duration, - passed=passed, - 
inputs=payload.inputs, - output=payload.output, - expected_output=payload.expected_output, - case_hash=payload.case_hash, - eval_hash=payload.eval_hash, - ) - - # --------------------------------------------------------------------------- # File writing helpers # --------------------------------------------------------------------------- diff --git a/protest/evals/types.py b/protest/evals/types.py index 7a2c19a..e78e33c 100644 --- a/protest/evals/types.py +++ b/protest/evals/types.py @@ -4,7 +4,10 @@ import statistics from dataclasses import dataclass, field -from typing import Any, Generic, Protocol, TypeVar, runtime_checkable +from typing import TYPE_CHECKING, Any, Generic, Protocol, TypeVar, runtime_checkable + +if TYPE_CHECKING: + from protest.entities.events import TestResult T = TypeVar("T") @@ -173,6 +176,44 @@ class EvalCaseResult: judge_cost: float = 0.0 is_error: bool = False + @classmethod + def from_test_result(cls, result: TestResult) -> EvalCaseResult: + """Build from a `TestResult` carrying an `eval_payload`. + + `passed` is derived from `result.error` and `payload.passed`, so both + the runner (post-execution) and the results writer (pass/fail hooks) + agree on the same computation. 
+ """ + payload = result.eval_payload + if payload is None: + raise ValueError( + f"Cannot build EvalCaseResult from TestResult without " + f"eval_payload (node_id={result.node_id})" + ) + return cls( + case_name=payload.case_name or "", + node_id=result.node_id, + scores=tuple( + EvalScore(name=name, value=entry.value) + for name, entry in payload.scores.items() + ), + duration=payload.task_duration, + passed=result.error is None and payload.passed, + inputs=payload.inputs, + output=payload.output, + expected_output=payload.expected_output, + case_hash=payload.case_hash, + eval_hash=payload.eval_hash, + task_input_tokens=payload.task_input_tokens, + task_output_tokens=payload.task_output_tokens, + task_cost=payload.task_cost, + judge_call_count=payload.judge_call_count, + judge_input_tokens=payload.judge_input_tokens, + judge_output_tokens=payload.judge_output_tokens, + judge_cost=payload.judge_cost, + is_error=result.is_fixture_error, + ) + @property def numeric_scores(self) -> dict[str, float]: return {s.name: float(s.value) for s in self.scores if s.is_metric} diff --git a/tests/evals/test_eval_case_result.py b/tests/evals/test_eval_case_result.py new file mode 100644 index 0000000..06471eb --- /dev/null +++ b/tests/evals/test_eval_case_result.py @@ -0,0 +1,135 @@ +"""Tests for `EvalCaseResult.from_test_result`. + +This classmethod is the single constructor used by both the runner (post- +execution) and the results writer (pass/fail hooks). The test below pins the +full field mapping so that future additions to `EvalPayload` or `TestResult` +either update the classmethod or break the test. 
+""" + +from __future__ import annotations + +import pytest + +from protest.entities.events import EvalPayload, EvalScoreEntry, TestResult +from protest.evals.types import EvalCaseResult + + +def _make_payload(**overrides: object) -> EvalPayload: + defaults: dict[str, object] = { + "case_name": "case_one", + "passed": True, + "task_duration": 0.123, + "inputs": "in", + "output": "out", + "expected_output": "expected", + "scores": {"accuracy": EvalScoreEntry(value=0.9, passed=True)}, + "case_hash": "ch", + "eval_hash": "eh", + "task_input_tokens": 100, + "task_output_tokens": 200, + "task_cost": 0.01, + "judge_call_count": 1, + "judge_input_tokens": 50, + "judge_output_tokens": 30, + "judge_cost": 0.005, + } + defaults.update(overrides) + return EvalPayload(**defaults) # type: ignore[arg-type] + + +def _make_result( + *, + error: Exception | None = None, + is_fixture_error: bool = False, + payload: EvalPayload | None = None, + node_id: str = "suite::case_one", +) -> TestResult: + return TestResult( + name="case_one", + node_id=node_id, + error=error, + is_fixture_error=is_fixture_error, + is_eval=True, + eval_payload=payload or _make_payload(), + ) + + +class TestFromTestResultHappyPath: + """Full field mapping: all payload + result fields land in the result.""" + + def test_all_fields_copied(self) -> None: + result = _make_result() + case = EvalCaseResult.from_test_result(result) + assert case.case_name == "case_one" + assert case.node_id == "suite::case_one" + assert case.duration == pytest.approx(0.123) + assert case.passed is True + assert case.inputs == "in" + assert case.output == "out" + assert case.expected_output == "expected" + assert case.case_hash == "ch" + assert case.eval_hash == "eh" + assert case.is_error is False + + def test_scores_converted_from_entries(self) -> None: + case = EvalCaseResult.from_test_result(_make_result()) + assert len(case.scores) == 1 + assert case.scores[0].name == "accuracy" + assert case.scores[0].value == 0.9 + + def 
test_task_usage_copied(self) -> None: + """Regression: writer used to drop these fields silently.""" + case = EvalCaseResult.from_test_result(_make_result()) + assert case.task_input_tokens == 100 + assert case.task_output_tokens == 200 + assert case.task_cost == pytest.approx(0.01) + + def test_judge_usage_copied(self) -> None: + """Regression: writer used to drop these fields silently.""" + case = EvalCaseResult.from_test_result(_make_result()) + assert case.judge_call_count == 1 + assert case.judge_input_tokens == 50 + assert case.judge_output_tokens == 30 + assert case.judge_cost == pytest.approx(0.005) + + +class TestFromTestResultPassedDerivation: + """`passed` is derived, not passed in — the writer no longer gets it wrong.""" + + def test_passed_when_no_error_and_payload_passed(self) -> None: + result = _make_result(payload=_make_payload(passed=True)) + assert EvalCaseResult.from_test_result(result).passed is True + + def test_failed_when_payload_not_passed(self) -> None: + result = _make_result(payload=_make_payload(passed=False)) + assert EvalCaseResult.from_test_result(result).passed is False + + def test_failed_when_error_present(self) -> None: + result = _make_result( + error=RuntimeError("boom"), + payload=_make_payload(passed=True), + ) + assert EvalCaseResult.from_test_result(result).passed is False + + def test_is_error_reflects_fixture_error(self) -> None: + result = _make_result( + error=RuntimeError("fx"), + is_fixture_error=True, + ) + case = EvalCaseResult.from_test_result(result) + assert case.is_error is True + assert case.passed is False + + +class TestFromTestResultErrors: + """Defensive: classmethod refuses a TestResult without eval_payload.""" + + def test_missing_payload_raises(self) -> None: + result = TestResult( + name="n", + node_id="x", + is_eval=False, + eval_payload=None, + ) + with pytest.raises(ValueError, match="eval_payload"): + EvalCaseResult.from_test_result(result) From fee2bf6a332d95d1af6155ad9fdc75dbea5f62b1 Mon Sep 17 
00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Fri, 24 Apr 2026 23:14:46 +0200 Subject: [PATCH 32/60] tests(evals): add tests for `EvalCase.metadata['tags']` wiring and enhance tag propagation logic - Added tests to verify that `EvalCase.metadata['tags']` are merged into `TestItem.tags`. - Updated `Collector` to propagate tags from `EvalCase.metadata` into `TestItem` during collection. - Verified end-to-end integration with `TagFilterPlugin` for tag-based filtering functionality. --- protest/core/collector.py | 10 ++- tests/evals/test_evalcase_tags_wiring.py | 96 ++++++++++++++++++++++++ 2 files changed, 105 insertions(+), 1 deletion(-) create mode 100644 tests/evals/test_evalcase_tags_wiring.py diff --git a/protest/core/collector.py b/protest/core/collector.py index d7c83db..72743e1 100644 --- a/protest/core/collector.py +++ b/protest/core/collector.py @@ -9,6 +9,7 @@ from protest.di.markers import Use from protest.di.validation import _extract_from_params from protest.entities import FixtureCallable, SuitePath, TestItem, TestRegistration +from protest.evals.evaluator import EvalCase if TYPE_CHECKING: from collections.abc import Callable @@ -176,11 +177,18 @@ def _expand_registration( sources[index].get_id(value) for index, value in enumerate(combination) ] + item_tags = tags.copy() + for value in combination: + if isinstance(value, EvalCase): + case_tags = value.metadata.get("tags") + if case_tags: + item_tags.update(case_tags) + items.append( TestItem( func=test_reg.func, suite=suite, - tags=tags.copy(), + tags=item_tags, case_kwargs=case_kwargs, case_ids=case_ids, skip=test_reg.skip, diff --git a/tests/evals/test_evalcase_tags_wiring.py b/tests/evals/test_evalcase_tags_wiring.py new file mode 100644 index 0000000..dbf9649 --- /dev/null +++ b/tests/evals/test_evalcase_tags_wiring.py @@ -0,0 +1,96 @@ +"""Tests for `EvalCase.metadata['tags']` → `TestItem.tags` wiring. 
+ +Verifies that tags declared on an `EvalCase` via `metadata={'tags': [...]}` +are merged into the resulting `TestItem.tags` set, so that the +`TagFilterPlugin` (which filters on `TestItem.tags`) can honor them. + +Eval functions are defined at module level to avoid `get_type_hints()` +resolution issues that occur with nested function definitions. +""" + +from __future__ import annotations + +from typing import Annotated + +from protest import ForEach, From, ProTestSession +from protest.core.collector import Collector +from protest.evals import EvalCase +from protest.evals.suite import EvalSuite +from protest.tags.plugin import TagFilterPlugin + +# Module-level case sources so `get_type_hints()` can resolve Annotated args. +_single_tagged = [EvalCase(inputs="x", name="c1", metadata={"tags": ["safety"]})] +_multi_tagged = [ + EvalCase(inputs="x", name="c1", metadata={"tags": ["safety", "factual"]}) +] +_mixed_cases = [ + EvalCase(inputs="x", name="c1", metadata={"tags": ["safety"]}), + EvalCase(inputs="y", name="c2", metadata={"tags": ["factual"]}), + EvalCase(inputs="z", name="c3"), +] +_no_tags_metadata = [ + EvalCase(inputs="x", name="c1", metadata={"other": "value"}), +] +_filter_cases = [ + EvalCase(inputs="a", name="c_safety", metadata={"tags": ["safety"]}), + EvalCase(inputs="b", name="c_factual", metadata={"tags": ["factual"]}), +] + + +def _collect(cases: list[EvalCase]) -> list: + """Build a session with a parametrized eval over `cases` and collect items.""" + session = ProTestSession() + suite = EvalSuite("evals") + + source = ForEach(cases) + + @suite.eval() + def my_eval(case: Annotated[EvalCase, From(source)]) -> str: + return str(case.inputs) + + _ = my_eval # silence unused-var diagnostics; decorator registers it + session.add_suite(suite) + return Collector().collect(session) + + +class TestCaseTagsMergedIntoItemTags: + def test_single_case_tag_becomes_item_tag(self) -> None: + items = _collect(_single_tagged) + assert len(items) == 1 + assert 
"safety" in items[0].tags + + def test_multiple_case_tags(self) -> None: + items = _collect(_multi_tagged) + assert items[0].tags >= {"safety", "factual"} + + def test_cases_get_distinct_tags(self) -> None: + items = _collect(_mixed_cases) + assert len(items) == 3 + by_name = {item.case_ids[0]: item for item in items} + assert "safety" in by_name["c1"].tags + assert "factual" not in by_name["c1"].tags + assert "factual" in by_name["c2"].tags + assert "safety" not in by_name["c2"].tags + assert by_name["c3"].tags == set() + + def test_case_without_tags_metadata_ok(self) -> None: + items = _collect(_no_tags_metadata) + assert items[0].tags == set() + + +class TestTagFilterHonorsCaseTags: + """End-to-end: `TagFilterPlugin` filters items based on case tags.""" + + def test_include_tag_keeps_matching_cases(self) -> None: + items = _collect(_filter_cases) + plugin = TagFilterPlugin(include_tags={"safety"}) + filtered = plugin.on_collection_finish(items) + assert len(filtered) == 1 + assert filtered[0].case_ids == ["c_safety"] + + def test_exclude_tag_drops_matching_cases(self) -> None: + items = _collect(_filter_cases) + plugin = TagFilterPlugin(exclude_tags={"safety"}) + filtered = plugin.on_collection_finish(items) + assert len(filtered) == 1 + assert filtered[0].case_ids == ["c_factual"] From 46c54d377f598f843d49d21f746e27699391c415 Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Fri, 24 Apr 2026 23:50:18 +0200 Subject: [PATCH 33/60] tests(history): add concurrency tests for `append_entry` and implement cross-platform file locking - Added tests to ensure `append_entry` supports concurrent writes without line corruption. - Implemented cross-platform file locking: `fcntl.flock` on POSIX and `msvcrt.locking` on Windows using a sibling `.lock` file. - Ensured single-writer and concurrency invariants for parseable JSON lines in history files. 
--- protest/history/storage.py | 63 +++++++++++-- tests/history/__init__.py | 0 .../history/test_append_entry_concurrency.py | 90 +++++++++++++++++++ 3 files changed, 146 insertions(+), 7 deletions(-) create mode 100644 tests/history/__init__.py create mode 100644 tests/history/test_append_entry_concurrency.py diff --git a/protest/history/storage.py b/protest/history/storage.py index 5dbe047..829b65e 100644 --- a/protest/history/storage.py +++ b/protest/history/storage.py @@ -2,10 +2,57 @@ from __future__ import annotations +import contextlib import json import subprocess +import sys from pathlib import Path -from typing import Any +from typing import IO, TYPE_CHECKING, Any + +if TYPE_CHECKING: + from collections.abc import Iterator + +if sys.platform == "win32": + import msvcrt + + @contextlib.contextmanager + def _exclusive_file_lock(f: IO[Any]) -> Iterator[None]: + """Hold an exclusive advisory lock on `f` for the block's duration. + + Windows `msvcrt.locking` cannot lock regions beyond EOF, so we lock + a sibling `.lock` file that we ensure always has 1 byte. All + writers cooperate on this sibling, so concurrent appends to the + main file are serialized. + """ + lock_path = Path(f"{f.name}.lock") + with open(lock_path, "a+b") as lf: + lf.seek(0, 2) + if lf.tell() == 0: + lf.write(b"\0") + lf.flush() + lf.seek(0) + msvcrt.locking(lf.fileno(), msvcrt.LK_LOCK, 1) + try: + yield + finally: + lf.seek(0) + msvcrt.locking(lf.fileno(), msvcrt.LK_UNLCK, 1) +else: + import fcntl + + @contextlib.contextmanager + def _exclusive_file_lock(f: IO[Any]) -> Iterator[None]: + """Hold an exclusive advisory lock on `f` for the block's duration. + + POSIX `fcntl.flock` locks the file descriptor directly; cross-process + callers opening the same path will block until the lock is released. 
+ """ + fcntl.flock(f.fileno(), fcntl.LOCK_EX) + try: + yield + finally: + fcntl.flock(f.fileno(), fcntl.LOCK_UN) + DEFAULT_HISTORY_DIR = Path(".protest") HISTORY_FILE = "history.jsonl" @@ -64,14 +111,16 @@ def _has_suite_kind(entry: dict[str, Any], kind: str) -> bool: def append_entry(path: Path, entry: dict[str, Any]) -> None: """Append a single JSON entry to a JSONL file. - Note: no file locking — concurrent writes from separate processes - could corrupt the file. In practice, protest runs are single-process - (async workers share the same process). If concurrent CI jobs write - to the same history file, consider using separate history_dir per job. + Serializes concurrent writes from separate processes sharing the same + history file (e.g. a CI matrix) via an exclusive advisory lock: + `fcntl.flock` on POSIX, `msvcrt.locking` on a sibling `.lock` + file on Windows. """ path.parent.mkdir(parents=True, exist_ok=True) - with open(path, "a") as f: - f.write(json.dumps(entry, default=str) + "\n") + line = json.dumps(entry, default=str) + "\n" + with open(path, "a") as f, _exclusive_file_lock(f): + f.write(line) + f.flush() def load_previous_run( diff --git a/tests/history/__init__.py b/tests/history/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/history/test_append_entry_concurrency.py b/tests/history/test_append_entry_concurrency.py new file mode 100644 index 0000000..5bd3d79 --- /dev/null +++ b/tests/history/test_append_entry_concurrency.py @@ -0,0 +1,90 @@ +"""Tests for `append_entry` — concurrent writer safety. + +Covers the basic invariant (one entry = one parseable line) and the +multiprocess-concurrency case: N workers append concurrently to the same +file; every line must be parseable JSON. Without locking, interleaved +writes larger than `PIPE_BUF` would corrupt lines and the test would fail. 
+""" + +from __future__ import annotations + +import json +import multiprocessing as mp +from pathlib import Path + +from protest.history.storage import append_entry + + +def _worker_append(args: tuple[str, int, int]) -> None: + """Child-process entry: append `count` entries, each padded to ~5 KB. + + The padding pushes the write past PIPE_BUF (4 KB) so that without a + lock the POSIX O_APPEND atomicity guarantee no longer applies. + """ + path_str, worker_id, count = args + path = Path(path_str) + padding = "x" * 5000 + for i in range(count): + append_entry(path, {"worker": worker_id, "i": i, "pad": padding}) + + +class TestAppendEntryBasic: + """Single-writer invariants.""" + + def test_creates_parent_dir(self, tmp_path: Path) -> None: + target = tmp_path / "nested" / "history.jsonl" + append_entry(target, {"k": "v"}) + assert target.exists() + assert target.parent.is_dir() + + def test_appends_one_line_per_call(self, tmp_path: Path) -> None: + path = tmp_path / "history.jsonl" + append_entry(path, {"a": 1}) + append_entry(path, {"b": 2}) + lines = path.read_text().splitlines() + assert len(lines) == 2 + assert json.loads(lines[0]) == {"a": 1} + assert json.loads(lines[1]) == {"b": 2} + + def test_default_str_serializes_non_json_types(self, tmp_path: Path) -> None: + """`json.dumps(..., default=str)` handles non-serializable values.""" + path = tmp_path / "history.jsonl" + + class Marker: + def __str__(self) -> str: + return "marker-str" + + append_entry(path, {"obj": Marker()}) + (line,) = path.read_text().splitlines() + assert json.loads(line) == {"obj": "marker-str"} + + +class TestAppendEntryConcurrency: + """Multi-process concurrent appends produce N parseable lines.""" + + def test_concurrent_writers_do_not_interleave(self, tmp_path: Path) -> None: + path = tmp_path / "history.jsonl" + workers = 8 + per_worker = 5 + total = workers * per_worker + + ctx = mp.get_context("spawn") + with ctx.Pool(workers) as pool: + pool.map( + _worker_append, + [(str(path), 
wid, per_worker) for wid in range(workers)], + ) + + lines = path.read_text().splitlines() + assert len(lines) == total, ( + f"expected {total} lines, got {len(lines)} — some writes were lost" + ) + + counts_per_worker: dict[int, int] = {} + for raw in lines: + entry = json.loads(raw) # raises JSONDecodeError on interleaved bytes + counts_per_worker[entry["worker"]] = ( + counts_per_worker.get(entry["worker"], 0) + 1 + ) + + assert counts_per_worker == dict.fromkeys(range(workers), per_worker) From f2909b2220fb45c3082b94e397f3443461b8f7d2 Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Sat, 25 Apr 2026 00:02:41 +0200 Subject: [PATCH 34/60] tests(history): add isolation tests for `DEFAULT_HISTORY_DIR` and override behaviors - Added regression tests to ensure `_isolate_protest_history` fixture correctly overrides `DEFAULT_HISTORY_DIR` with a per-test temp directory. - Verified that `HistoryPlugin` respects explicit `history_dir` values while defaulting to the overridden directory. - Updated `conftest.py` with autouse fixture to prevent test pollution of real `.protest/history.jsonl`. 
--- protest/history/plugin.py | 4 +- tests/conftest.py | 17 ++++++++ tests/history/test_history_dir_isolation.py | 45 +++++++++++++++++++++ 3 files changed, 64 insertions(+), 2 deletions(-) create mode 100644 tests/history/test_history_dir_isolation.py diff --git a/protest/history/plugin.py b/protest/history/plugin.py index 92b7942..bf19f0a 100644 --- a/protest/history/plugin.py +++ b/protest/history/plugin.py @@ -8,9 +8,9 @@ from protest.entities import SuiteKind from protest.evals.suite import EvalSuite +from protest.history import storage from protest.history.collector import collect_env_info, collect_git_info from protest.history.storage import ( - DEFAULT_HISTORY_DIR, HISTORY_FILE, append_entry, load_previous_run, @@ -38,7 +38,7 @@ class HistoryPlugin(PluginBase): description = "Run history tracking" def __init__(self, history_dir: Path | None = None) -> None: - self._history_dir = history_dir or DEFAULT_HISTORY_DIR + self._history_dir = history_dir or storage.DEFAULT_HISTORY_DIR self._history_file = self._history_dir / HISTORY_FILE # Test data self._test_suites: dict[str, dict[str, dict[str, Any]]] = {} diff --git a/tests/conftest.py b/tests/conftest.py index 5e14ed2..a40d851 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -13,11 +13,28 @@ TestItem, TestResult, ) +from protest.history import storage as history_storage from protest.plugin import PluginBase from tests.factories.test_items import make_test_item if TYPE_CHECKING: from collections.abc import Callable + from pathlib import Path + + +@pytest.fixture(autouse=True) +def _isolate_protest_history(tmp_path: "Path", monkeypatch: pytest.MonkeyPatch) -> None: + """Redirect `DEFAULT_HISTORY_DIR` to a per-test temp dir. + + Tests that forget to pass `history_dir=tmp_path` would otherwise write + into the repo's real `.protest/history.jsonl`. 
The monkeypatch targets + the single source of truth (`storage.DEFAULT_HISTORY_DIR`) — all + consumers access it via the module so the override is seen everywhere. + + Tests that pass an explicit `history_dir` still use that value, because + the plugin does `history_dir or storage.DEFAULT_HISTORY_DIR`. + """ + monkeypatch.setattr(history_storage, "DEFAULT_HISTORY_DIR", tmp_path / ".protest") @pytest.fixture diff --git a/tests/history/test_history_dir_isolation.py b/tests/history/test_history_dir_isolation.py new file mode 100644 index 0000000..26946ac --- /dev/null +++ b/tests/history/test_history_dir_isolation.py @@ -0,0 +1,45 @@ +"""Regression tests for B2: tests must not pollute the repo's history file. + +The autouse `_isolate_protest_history` fixture in `tests/conftest.py` +monkeypatches `storage.DEFAULT_HISTORY_DIR` to a per-test temp directory. +These tests assert that both the storage functions and the HistoryPlugin +pick up the override — any regression in the plumbing would let runs leak +into `.protest/history.jsonl` in the real project cwd. 
+""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +from protest.history import storage +from protest.history.plugin import HistoryPlugin + +if TYPE_CHECKING: + from pathlib import Path + + +class TestDefaultHistoryDirOverride: + """The autouse fixture redirects the module-level constant.""" + + def test_storage_default_points_to_tmp(self, tmp_path: Path) -> None: + assert tmp_path / ".protest" == storage.DEFAULT_HISTORY_DIR + + def test_append_entry_uses_override(self, tmp_path: Path) -> None: + target = storage.DEFAULT_HISTORY_DIR / storage.HISTORY_FILE + storage.append_entry(target, {"k": "v"}) + assert target.exists() + assert target.is_relative_to(tmp_path) + + def test_plugin_default_dir_follows_override(self, tmp_path: Path) -> None: + plugin = HistoryPlugin() + assert plugin._history_dir == tmp_path / ".protest" + assert plugin._history_file.is_relative_to(tmp_path) + + +class TestExplicitHistoryDirWins: + """Explicit `history_dir=` still takes precedence over the override.""" + + def test_plugin_honors_explicit_dir(self, tmp_path: Path) -> None: + explicit = tmp_path / "custom" + plugin = HistoryPlugin(history_dir=explicit) + assert plugin._history_dir == explicit From acdacfdb42ea1dabddf36558fc5c74e1f1cc9779 Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Sat, 25 Apr 2026 00:18:50 +0200 Subject: [PATCH 35/60] tests(execution): add tests for `real_stdout` / `real_stderr` and replace sys stream duck-typing - Added unit tests for `real_stdout` and `real_stderr` to ensure proper unwrapping of `TaskAwareStream` and correct fallback to original streams. - Replaced `getattr(sys.stdout, "_original", ...)` duck-typing with typed accessors across multiple modules for better maintainability and robustness. - Updated console, reporters, and fallback print logic to utilize the new accessors, ensuring consistent bypass of per-test capture layers. 
--- protest/console.py | 6 ++-- protest/execution/capture.py | 19 ++++++++++++ protest/reporting/ascii.py | 6 ++-- protest/reporting/rich_reporter.py | 6 ++-- tests/execution/test_real_streams.py | 46 ++++++++++++++++++++++++++++ 5 files changed, 73 insertions(+), 10 deletions(-) create mode 100644 tests/execution/test_real_streams.py diff --git a/protest/console.py b/protest/console.py index 9959165..30ee49b 100644 --- a/protest/console.py +++ b/protest/console.py @@ -21,10 +21,9 @@ async def pipeline(): import contextlib import re -import sys from protest.events.types import Event -from protest.execution.capture import get_event_bus +from protest.execution.capture import get_event_bus, real_stderr def print(msg: str, *, raw: bool = False) -> None: @@ -52,8 +51,7 @@ def print(msg: str, *, raw: bool = False) -> None: def _fallback_print(msg: str, raw: bool) -> None: """Fallback when no event bus — write to real stderr (bypassing capture).""" text = msg if raw else strip_markup(msg) - # sys.stderr may be wrapped by TaskAwareStream — get the original - stream = getattr(sys.stderr, "_original", sys.stderr) + stream = real_stderr() stream.write(text + "\n") stream.flush() diff --git a/protest/execution/capture.py b/protest/execution/capture.py index 2e258a7..c5f54c0 100644 --- a/protest/execution/capture.py +++ b/protest/execution/capture.py @@ -148,6 +148,25 @@ def __getattr__(self, name: str) -> object: return getattr(self._original, name) +def real_stdout() -> TextIO: + """Return the real process stdout, bypassing any active capture wrapper. + + When a run is under capture, `sys.stdout` is a `TaskAwareStream` routing + writes into per-test buffers; reporters need to bypass that buffering to + write their own output (progress, summary) directly to the terminal. 
+ """ + if isinstance(sys.stdout, TaskAwareStream): + return sys.stdout._original + return sys.stdout + + +def real_stderr() -> TextIO: + """Return the real process stderr, bypassing any active capture wrapper.""" + if isinstance(sys.stderr, TaskAwareStream): + return sys.stderr._original + return sys.stderr + + class TaskAwareLogHandler(logging.Handler): def emit(self, record: LogRecord) -> None: records = _log_records.get() diff --git a/protest/reporting/ascii.py b/protest/reporting/ascii.py index 64470b8..018bedf 100644 --- a/protest/reporting/ascii.py +++ b/protest/reporting/ascii.py @@ -1,5 +1,4 @@ import logging -import sys import traceback from pathlib import Path from typing import Any @@ -23,6 +22,7 @@ TestTeardownInfo, ) from protest.evals.types import EvalSuiteReport +from protest.execution.capture import real_stdout from protest.plugin import PluginBase, PluginContext from protest.reporting.format import ( format_duration as _format_duration, @@ -200,7 +200,7 @@ def on_test_teardown_start(self, info: TestTeardownInfo) -> None: @staticmethod def _print_bypass(msg: str) -> None: - stream = getattr(sys.stdout, "_original", sys.stdout) + stream = real_stdout() stream.write(msg + "\n") stream.flush() @@ -320,7 +320,7 @@ def _print_failure_detail(self, result: TestResult, *, is_error: bool) -> None: def on_user_print(self, data: Any) -> None: msg, raw = data text = msg if raw else strip_markup(msg) - stream = getattr(sys.stdout, "_original", sys.stdout) + stream = real_stdout() stream.write(f" | {text}\n") stream.flush() diff --git a/protest/reporting/rich_reporter.py b/protest/reporting/rich_reporter.py index 5794457..bf93406 100644 --- a/protest/reporting/rich_reporter.py +++ b/protest/reporting/rich_reporter.py @@ -1,5 +1,4 @@ import logging -import sys import traceback from argparse import ArgumentParser from pathlib import Path @@ -25,6 +24,7 @@ TestTeardownInfo, ) from protest.evals.types import EvalSuiteReport +from protest.execution.capture import 
real_stdout from protest.plugin import PluginBase, PluginContext from protest.reporting.format import ( format_duration as _format_duration, @@ -152,7 +152,7 @@ def _maybe_show_logs(self, result: TestResult) -> None: def _print_bypass(self, message: str) -> None: """Print bypassing capture (for lifecycle messages emitted during tests).""" - stream = getattr(sys.stdout, "_original", sys.stdout) + stream = real_stdout() Console(file=stream, highlight=False).print(message) def on_collection_finish(self, items: list[TestItem]) -> list[TestItem]: @@ -377,7 +377,7 @@ def _print_failure_detail(self, result: TestResult, *, is_error: bool) -> None: def on_user_print(self, data: Any) -> None: msg, raw = data # Write to the real stdout, bypassing capture - stream = getattr(sys.stdout, "_original", sys.stdout) + stream = real_stdout() c = Console(file=stream, highlight=False) if raw: c.print(msg, markup=False) diff --git a/tests/execution/test_real_streams.py b/tests/execution/test_real_streams.py new file mode 100644 index 0000000..c7b53de --- /dev/null +++ b/tests/execution/test_real_streams.py @@ -0,0 +1,46 @@ +"""Tests for `real_stdout()` / `real_stderr()`. + +These accessors replace the previous `getattr(sys.stdout, "_original", ...)` +duck-typing. They give reporters a typed way to bypass the per-test capture +wrapper, so renaming or removing the private attribute won't silently break +reporter output. 
+""" + +from __future__ import annotations + +import io +import sys + +from protest.execution.capture import ( + TaskAwareStream, + real_stderr, + real_stdout, +) + + +class TestRealStdoutUnwrapsTaskAwareStream: + def test_returns_stdout_when_not_wrapped(self) -> None: + assert real_stdout() is sys.stdout + + def test_unwraps_wrapped_stream(self) -> None: + buffer = io.StringIO() + wrapper = TaskAwareStream(buffer) + sys.stdout = wrapper # type: ignore[assignment] + try: + assert real_stdout() is buffer + finally: + sys.stdout = sys.__stdout__ + + +class TestRealStderrUnwrapsTaskAwareStream: + def test_returns_stderr_when_not_wrapped(self) -> None: + assert real_stderr() is sys.stderr + + def test_unwraps_wrapped_stream(self) -> None: + buffer = io.StringIO() + wrapper = TaskAwareStream(buffer) + sys.stderr = wrapper # type: ignore[assignment] + try: + assert real_stderr() is buffer + finally: + sys.stderr = sys.__stderr__ From 715857eaddc68c315d5ddbd3efc7ec02613c70a9 Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Sat, 25 Apr 2026 01:07:03 +0200 Subject: [PATCH 36/60] refactor(console, capture): improve type annotations and clarify event bus usage - Added `EventBus` type annotations for `_event_bus_ref` and related methods to improve clarity and type safety. - Updated comments in `console.print` to explain the necessity of private access to `bus._handlers` and its rationale. - Added `TYPE_CHECKING` imports to minimize runtime overhead while maintaining forward references. --- protest/console.py | 10 +++++++--- protest/execution/capture.py | 19 ++++++++++++------- 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/protest/console.py b/protest/console.py index 30ee49b..2d31607 100644 --- a/protest/console.py +++ b/protest/console.py @@ -41,9 +41,13 @@ def print(msg: str, *, raw: bool = False) -> None: _fallback_print(msg, raw) return - # Call handlers directly (sync, bypasses async emit). 
- # This ensures messages appear immediately, not after the test. - for handler_entry in bus._handlers.get(Event.USER_PRINT, []): # type: ignore[attr-defined] + # Intentional private access to `bus._handlers`: we need sync dispatch + # so messages appear immediately (not after the test). An earlier public + # `EventBus.emit_sync` was removed (commit e14ffd5) because its signal- + # handler use case was async-signal-unsafe, and we don't want to offer + # that API to users. Kept private here — the framework itself is the + # only caller, and console.print is never invoked from a signal handler. + for handler_entry in bus._handlers.get(Event.USER_PRINT, []): with contextlib.suppress(Exception): handler_entry.func((msg, raw)) diff --git a/protest/execution/capture.py b/protest/execution/capture.py index c5f54c0..584dbf3 100644 --- a/protest/execution/capture.py +++ b/protest/execution/capture.py @@ -1,14 +1,19 @@ +from __future__ import annotations + import io import logging import sys -from collections.abc import Callable from contextlib import suppress from contextvars import ContextVar, Token from dataclasses import dataclass from logging import LogRecord -from typing import TextIO +from typing import TYPE_CHECKING, TextIO + +if TYPE_CHECKING: + from collections.abc import Callable -from protest.compat import Self + from protest.compat import Self + from protest.events.bus import EventBus _capture_buffer: ContextVar[io.StringIO | None] = ContextVar( "capture_buffer", default=None @@ -19,7 +24,7 @@ ) _current_node_id: ContextVar[str | None] = ContextVar("current_node_id", default=None) -_event_bus_ref: ContextVar[object | None] = ContextVar("event_bus_ref", default=None) +_event_bus_ref: ContextVar[EventBus | None] = ContextVar("event_bus_ref", default=None) @dataclass(slots=True) @@ -101,17 +106,17 @@ def get_session_teardown_output() -> str: return _session_teardown.buffer.getvalue() if _session_teardown.buffer else "" -def set_event_bus(bus: object) -> 
Token[object | None]: +def set_event_bus(bus: EventBus) -> Token[EventBus | None]: """Set event bus reference for console.print() access.""" return _event_bus_ref.set(bus) -def reset_event_bus(token: Token[object | None]) -> None: +def reset_event_bus(token: Token[EventBus | None]) -> None: """Reset event bus reference.""" _event_bus_ref.reset(token) -def get_event_bus() -> object | None: +def get_event_bus() -> EventBus | None: """Get current event bus (for console.print).""" return _event_bus_ref.get() From 594bb547c0435a2a7f7964bdbde2c603bff2ab44 Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Sat, 25 Apr 2026 09:11:11 +0200 Subject: [PATCH 37/60] feat(history): version JSONL entries via `schema_version` with skip+warn on future versions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - `SCHEMA_VERSION = 1` constant in `storage`; `HistoryPlugin` stamps it on every new entry. - Readers (`load_history`, `load_previous_run`) skip entries whose `schema_version` exceeds the current value, with a one-time warning per version (deduplicated via a module-level set). - Legacy entries (no `schema_version` key) treated as version 0 and read normally — zero migration needed. - Add `tests/history/test_schema_version.py` covering writes, future-version skipping, warn-once behavior, and legacy compat. 
--- protest/history/plugin.py | 2 + protest/history/storage.py | 34 +++++++++ tests/history/test_schema_version.py | 109 +++++++++++++++++++++++++++ 3 files changed, 145 insertions(+) create mode 100644 tests/history/test_schema_version.py diff --git a/protest/history/plugin.py b/protest/history/plugin.py index bf19f0a..ca738a3 100644 --- a/protest/history/plugin.py +++ b/protest/history/plugin.py @@ -12,6 +12,7 @@ from protest.history.collector import collect_env_info, collect_git_info from protest.history.storage import ( HISTORY_FILE, + SCHEMA_VERSION, append_entry, load_previous_run, ) @@ -194,6 +195,7 @@ def on_session_end(self, result: Any) -> None: } entry: dict[str, Any] = { + "schema_version": SCHEMA_VERSION, "run_id": str(uuid.uuid4()), "timestamp": datetime.now(tz=timezone.utc).isoformat(), "git": collect_git_info(), diff --git a/protest/history/storage.py b/protest/history/storage.py index 829b65e..8f89fa4 100644 --- a/protest/history/storage.py +++ b/protest/history/storage.py @@ -6,6 +6,7 @@ import json import subprocess import sys +import warnings from pathlib import Path from typing import IO, TYPE_CHECKING, Any @@ -57,6 +58,35 @@ def _exclusive_file_lock(f: IO[Any]) -> Iterator[None]: DEFAULT_HISTORY_DIR = Path(".protest") HISTORY_FILE = "history.jsonl" +# JSONL entry schema version. Bump when the on-disk shape changes in a way +# that older readers can't transparently handle (new required fields, +# restructured nesting). Entries written before this was introduced have no +# `schema_version` key and are treated as version 0 (legacy — best-effort). +SCHEMA_VERSION = 1 + +_warned_future_versions: set[int] = set() + + +def _is_future_schema(entry: dict[str, Any]) -> bool: + """Return True if the entry was written by a newer protest version. + + Entries with `schema_version > SCHEMA_VERSION` are skipped by readers, + with a one-time warning per version (avoids N warnings for N such + entries). 
+ """ + version = entry.get("schema_version", 0) + if not isinstance(version, int) or version <= SCHEMA_VERSION: + return False + if version not in _warned_future_versions: + _warned_future_versions.add(version) + warnings.warn( + f"history.jsonl contains entries with schema_version={version}, " + f"but this protest supports up to {SCHEMA_VERSION}. " + f"Those entries will be skipped. Upgrade protest to read them.", + stacklevel=3, + ) + return True + def load_history( history_dir: Path | None = None, @@ -77,6 +107,8 @@ def load_history( entry = json.loads(line) except json.JSONDecodeError: continue + if _is_future_schema(entry): + continue if evals_only and not _has_suite_kind(entry, "eval"): continue if tests_only and not _has_suite_kind(entry, "test"): @@ -137,6 +169,8 @@ def load_previous_run( entry = json.loads(line) except json.JSONDecodeError: continue + if _is_future_schema(entry): + continue if evals_only and entry.get("evals") is None: continue return dict(entry) diff --git a/tests/history/test_schema_version.py b/tests/history/test_schema_version.py new file mode 100644 index 0000000..b4a0724 --- /dev/null +++ b/tests/history/test_schema_version.py @@ -0,0 +1,109 @@ +"""Tests for `schema_version` on history JSONL entries. + +The plugin stamps every new entry with `schema_version`. Readers skip +entries with a future version (written by a newer protest) and warn once +per version. + +Legacy entries (no `schema_version` key at all — written before this was +introduced) are treated as version 0 and read without warning. 
+""" + +from __future__ import annotations + +import json +import warnings +from typing import TYPE_CHECKING + +from protest.history import storage +from protest.history.plugin import HistoryPlugin + +if TYPE_CHECKING: + from pathlib import Path + + +def _write_jsonl(path: Path, entries: list[dict]) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text("\n".join(json.dumps(e) for e in entries) + "\n") + + +class TestSchemaVersionWrites: + def test_append_entry_writes_schema_version_via_plugin(self) -> None: + """HistoryPlugin stamps `schema_version` on every new entry.""" + plugin = HistoryPlugin() + assert storage.SCHEMA_VERSION >= 1 + + entry_with_version = {"schema_version": storage.SCHEMA_VERSION, "k": "v"} + storage.append_entry(plugin._history_file, entry_with_version) + loaded = json.loads(plugin._history_file.read_text().splitlines()[0]) + assert loaded["schema_version"] == storage.SCHEMA_VERSION + + +class TestFutureVersionSkipped: + def test_future_version_is_skipped_by_load_history(self, tmp_path: Path) -> None: + path = tmp_path / ".protest" / storage.HISTORY_FILE + _write_jsonl( + path, + [ + {"schema_version": storage.SCHEMA_VERSION, "run_id": "current"}, + {"schema_version": storage.SCHEMA_VERSION + 10, "run_id": "future"}, + ], + ) + storage._warned_future_versions.clear() + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + entries = storage.load_history(history_dir=tmp_path / ".protest") + run_ids = [e["run_id"] for e in entries] + assert run_ids == ["current"] + + def test_future_version_is_skipped_by_load_previous_run( + self, tmp_path: Path + ) -> None: + path = tmp_path / ".protest" / storage.HISTORY_FILE + _write_jsonl( + path, + [ + {"schema_version": storage.SCHEMA_VERSION, "run_id": "older"}, + {"schema_version": storage.SCHEMA_VERSION + 1, "run_id": "newer"}, + ], + ) + storage._warned_future_versions.clear() + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + entry = 
storage.load_previous_run(history_dir=tmp_path / ".protest") + assert entry is not None + assert entry["run_id"] == "older" + + def test_warning_raised_once_per_future_version(self, tmp_path: Path) -> None: + path = tmp_path / ".protest" / storage.HISTORY_FILE + future = storage.SCHEMA_VERSION + 42 + _write_jsonl( + path, + [{"schema_version": future, "run_id": str(i)} for i in range(5)], + ) + storage._warned_future_versions.clear() + with warnings.catch_warnings(record=True) as caught: + warnings.simplefilter("always") + storage.load_history(history_dir=tmp_path / ".protest") + future_warnings = [ + w for w in caught if f"schema_version={future}" in str(w.message) + ] + assert len(future_warnings) == 1 + + +class TestLegacyEntriesStillReadable: + """Pre-schema_version entries have no key — treat as legacy (version 0).""" + + def test_entry_without_schema_version_is_read(self, tmp_path: Path) -> None: + path = tmp_path / ".protest" / storage.HISTORY_FILE + _write_jsonl(path, [{"run_id": "legacy", "suites": {}}]) + storage._warned_future_versions.clear() + entries = storage.load_history(history_dir=tmp_path / ".protest") + assert len(entries) == 1 + assert entries[0]["run_id"] == "legacy" + + def test_entry_with_version_zero_is_read(self, tmp_path: Path) -> None: + path = tmp_path / ".protest" / storage.HISTORY_FILE + _write_jsonl(path, [{"schema_version": 0, "run_id": "v0"}]) + storage._warned_future_versions.clear() + entries = storage.load_history(history_dir=tmp_path / ".protest") + assert len(entries) == 1 From 4276e5d123ebdbef5abf0d4f87eedb2e14b68bc3 Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Sat, 25 Apr 2026 09:19:11 +0200 Subject: [PATCH 38/60] fix(evals): use `statistics.quantiles` for true p5/p95 in `ScoreStats` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces naive `int(n * 0.05)` index lookup that collapsed p5/p95 to min/max for small samples (the 
typical eval case: n=10 returned sv[0]/sv[9]). Now uses `statistics.quantiles(n=20, method='inclusive')` which interpolates linearly between adjacent values and clamps to [min, max] — appropriate for bounded scores. - Single-value case (n=1) falls back to that value (percentiles undefined). - Empty case unchanged: zeroed stats. - `_MIN_VALUES_FOR_PERCENTILES = 2` constant gates the quantiles call. - Add `tests/evals/test_score_stats.py` covering empty / n=1 / n=2 / n=10 (the regression case) / n=100 / sort-independence. --- protest/evals/types.py | 18 ++++++- tests/evals/test_score_stats.py | 92 +++++++++++++++++++++++++++++++++ 2 files changed, 108 insertions(+), 2 deletions(-) create mode 100644 tests/evals/test_score_stats.py diff --git a/protest/evals/types.py b/protest/evals/types.py index e78e33c..141047b 100644 --- a/protest/evals/types.py +++ b/protest/evals/types.py @@ -223,6 +223,9 @@ def failed_scores(self) -> tuple[EvalScore, ...]: return tuple(s for s in self.scores if not s.passed) +_MIN_VALUES_FOR_PERCENTILES = 2 # statistics.quantiles requires at least 2 inputs + + @dataclass(frozen=True, slots=True) class ScoreStats: """Aggregated statistics for a named score across cases.""" @@ -242,12 +245,23 @@ def from_values(cls, name: str, values: list[float]) -> ScoreStats: return cls(name=name, mean=0, median=0, p5=0, p95=0, min=0, max=0, count=0) sv = sorted(values) n = len(sv) + if n >= _MIN_VALUES_FOR_PERCENTILES: + # `quantiles(n=20, method='inclusive')` returns 19 cutpoints that + # split the data into 20 equal groups. Index 0 = 5%, index 18 = 95%. + # Inclusive method interpolates linearly between adjacent values + # and clamps to [min, max] — appropriate for bounded scores. + cuts = statistics.quantiles(sv, n=20, method="inclusive") + p5_value = cuts[0] + p95_value = cuts[18] + else: + # Single value: percentiles are undefined; fall back to that value. 
+ p5_value = p95_value = sv[0] return cls( name=name, mean=statistics.mean(sv), median=statistics.median(sv), - p5=sv[max(0, int(n * 0.05))], - p95=sv[min(n - 1, int(n * 0.95))], + p5=p5_value, + p95=p95_value, min=sv[0], max=sv[-1], count=n, diff --git a/tests/evals/test_score_stats.py b/tests/evals/test_score_stats.py new file mode 100644 index 0000000..7a0eb90 --- /dev/null +++ b/tests/evals/test_score_stats.py @@ -0,0 +1,92 @@ +"""Tests for `ScoreStats.from_values` — percentile correctness. + +Pre-M11, p5/p95 used `int(n * 0.05)` index lookup, which collapses to +min/max for small samples (the typical eval case). Post-M11 uses +`statistics.quantiles(method='inclusive')` for true linear-interpolated +percentiles. These tests pin the new behavior. +""" + +from __future__ import annotations + +import pytest + +from protest.evals.types import ScoreStats + + +class TestEmptyAndSingleValue: + def test_empty_returns_zeroed_stats(self) -> None: + stats = ScoreStats.from_values("acc", []) + assert stats.count == 0 + assert stats.mean == 0 + assert stats.p5 == 0 + assert stats.p95 == 0 + assert stats.min == 0 + assert stats.max == 0 + + def test_single_value_collapses_percentiles(self) -> None: + """One value → percentiles undefined; fall back to that value.""" + stats = ScoreStats.from_values("acc", [0.42]) + assert stats.count == 1 + assert stats.mean == pytest.approx(0.42) + assert stats.median == pytest.approx(0.42) + assert stats.p5 == pytest.approx(0.42) + assert stats.p95 == pytest.approx(0.42) + assert stats.min == pytest.approx(0.42) + assert stats.max == pytest.approx(0.42) + + +class TestPercentilesNotCollapsedForSmallSamples: + """Regression: with n=10 the old impl returned min/max for p5/p95.""" + + def test_n_equals_10_p5_is_above_min(self) -> None: + values = [float(i) for i in range(10)] # 0..9 + stats = ScoreStats.from_values("acc", values) + # Inclusive method interpolates: p5 of [0..9] is 0.45, p95 is 8.55 + assert stats.min == 0 + assert stats.p5 > 
stats.min + assert stats.p5 == pytest.approx(0.45, abs=0.01) + + def test_n_equals_10_p95_is_below_max(self) -> None: + values = [float(i) for i in range(10)] + stats = ScoreStats.from_values("acc", values) + assert stats.max == 9 + assert stats.p95 < stats.max + assert stats.p95 == pytest.approx(8.55, abs=0.01) + + def test_n_equals_2_interpolates(self) -> None: + """Inclusive percentiles work even for n=2 (interpolation).""" + stats = ScoreStats.from_values("acc", [0.0, 1.0]) + assert stats.p5 == pytest.approx(0.05, abs=0.01) + assert stats.p95 == pytest.approx(0.95, abs=0.01) + + +class TestPercentilesAccurateForLargeSamples: + def test_n_equals_100_uniform_distribution(self) -> None: + """For uniform 0..99, p5 ≈ 5 and p95 ≈ 95 (inclusive method).""" + values = [float(i) for i in range(100)] + stats = ScoreStats.from_values("acc", values) + assert stats.p5 == pytest.approx(4.95, abs=0.1) + assert stats.p95 == pytest.approx(94.05, abs=0.1) + + def test_unsorted_input_is_sorted_internally(self) -> None: + """from_values must not depend on input order.""" + ordered = ScoreStats.from_values("a", [0.1, 0.2, 0.3, 0.4, 0.5]) + shuffled = ScoreStats.from_values("a", [0.3, 0.5, 0.1, 0.4, 0.2]) + assert ordered.p5 == pytest.approx(shuffled.p5) + assert ordered.p95 == pytest.approx(shuffled.p95) + assert ordered.median == pytest.approx(shuffled.median) + + +class TestBasicStatsStillCorrect: + """Mean/median/min/max/count are unchanged.""" + + def test_mean_and_median(self) -> None: + stats = ScoreStats.from_values("acc", [1.0, 2.0, 3.0, 4.0, 5.0]) + assert stats.mean == pytest.approx(3.0) + assert stats.median == pytest.approx(3.0) + + def test_min_max_count(self) -> None: + stats = ScoreStats.from_values("acc", [0.2, 0.7, 0.1, 0.9, 0.5]) + assert stats.min == pytest.approx(0.1) + assert stats.max == pytest.approx(0.9) + assert stats.count == 5 From a7f29ccc7f523d41b796fd1905230b18a8f61aba Mon Sep 17 00:00:00 2001 From: Renaud Cepre 
<32103211+renaudcepre@users.noreply.github.com> Date: Sat, 25 Apr 2026 10:50:12 +0200 Subject: [PATCH 39/60] chore: address review minors (m2, m3, m4, m6, m7, m10, m11) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - m2: replace `lambda` with `functools.partial` in CLI command dispatch (`protest/cli/main.py`). - m3: route `EvalResultsWriter` "Results: ..." line through `console.print` instead of builtin `print`, so it bypasses test capture consistently. - m4: `Evaluator.__call__` now always returns a fresh clone in the re-binding path; removes the surprising `f is f()` identity. - m6: replace `"tests"` sentinel for `_default_suite_name` with `None`, fall back to the literal `"tests"` only when no test suite registered. A user-defined suite literally named `"tests"` no longer collides with the default-detection heuristic. - m7: add a Contents section (TOC) to `docs/evals.md` for raw-file navigability (mkdocs already auto-generates a sidebar TOC). - m10: clarify `FakeJudge.judge` comment — caller must use a dataclass with all-default fields. - m11: type `EvalSuite.eval(judge=)` as `Judge | None` (was `Any`) and document the per-eval override behavior in the docstring. Verified intentional / already-resolved: m1 (`console.print` shadow is the API), m5 (deduplicated via M3), m8 (deferred — needs PEP 696), m9 (`_canonical` resolution order is documented), m12 (`SuiteKind` is a `StrEnum`, no mismatch between str/enum comparisons). 
--- docs/evals.md | 21 +++++++++++++++++++++ protest/cli/main.py | 5 +++-- protest/evals/evaluator.py | 8 +++++--- protest/evals/results_writer.py | 3 ++- protest/evals/suite.py | 13 +++++++++++-- protest/history/plugin.py | 8 +++++--- tests/evals/test_judge.py | 3 ++- 7 files changed, 49 insertions(+), 12 deletions(-) diff --git a/docs/evals.md b/docs/evals.md index 562bcde..e778b22 100644 --- a/docs/evals.md +++ b/docs/evals.md @@ -2,6 +2,27 @@ Evaluate LLM outputs with scored metrics and historical tracking. +## Contents + +- [What is an Eval?](#what-is-an-eval) +- [Quick Start](#quick-start) +- [How It Works](#how-it-works) +- [EvalSuite](#evalsuite) +- [EvalCase](#evalcase) +- [Evaluators](#evaluators) +- [Fixtures](#fixtures) +- [ModelInfo](#modelinfo) +- [Judge](#judge) +- [TaskResult (SUT Usage Tracking)](#taskresult-sut-usage-tracking) +- [Usage Display](#usage-display) +- [Evaluator Errors](#evaluator-errors) +- [Name Collisions](#name-collisions) +- [Multi-Model Sessions](#multi-model-sessions) +- [CLI](#cli) +- [Output](#output) +- [History](#history) +- [Progress Output](#progress-output) + ## What is an Eval? A test produces **pass/fail**. An eval produces **scores** — numeric values (0.0–1.0) that measure output quality. Scores are aggregated across cases, tracked over time, and compared between runs. 
diff --git a/protest/cli/main.py b/protest/cli/main.py index 9c0b324..574825f 100644 --- a/protest/cli/main.py +++ b/protest/cli/main.py @@ -1,6 +1,7 @@ from __future__ import annotations import argparse +import functools import sys from typing import TYPE_CHECKING, Any @@ -106,8 +107,8 @@ def main() -> None: commands: dict[str, Any] = { "tags": _handle_tags_command, - "run": lambda: _handle_run_command(kind_filter="test"), - "eval": lambda: _handle_run_command(kind_filter="eval"), + "run": functools.partial(_handle_run_command, kind_filter="test"), + "eval": functools.partial(_handle_run_command, kind_filter="eval"), "history": _handle_history_command, "live": _handle_live_command, } diff --git a/protest/evals/evaluator.py b/protest/evals/evaluator.py index 242bb64..80881f9 100644 --- a/protest/evals/evaluator.py +++ b/protest/evals/evaluator.py @@ -255,9 +255,11 @@ def __call__(self, *args: Any, **kwargs: Any) -> Any: if args and isinstance(args[0], EvalContext): merged = {**self._kwargs, **kwargs} return self._fn(*args, **merged) - if kwargs: - return Evaluator(self._fn, {**self._kwargs, **kwargs}) - return self + # Re-binding form (no EvalContext): always returns a fresh clone. + # Returning `self` for the no-kwargs case used to make `f is f()` + # accidentally true, which surprised users expecting `()` to behave + # like an evaluator constructor. 
+ return Evaluator(self._fn, {**self._kwargs, **kwargs}) def evaluator_identity(self) -> dict[str, Any]: identity: dict[str, Any] = {"fn": self._qualname} diff --git a/protest/evals/results_writer.py b/protest/evals/results_writer.py index db64f0e..b611d6b 100644 --- a/protest/evals/results_writer.py +++ b/protest/evals/results_writer.py @@ -11,6 +11,7 @@ from pathlib import Path from typing import TYPE_CHECKING, Any +from protest import console from protest.evals.types import EvalCaseResult, EvalScore, EvalSuiteReport from protest.plugin import PluginBase @@ -62,7 +63,7 @@ def on_eval_suite_end(self, report: Any) -> None: return run_dir = self._run_dirs.get(report.suite_name) if run_dir: - print(f" Results: {run_dir}") + console.print(f" Results: {run_dir}") # --------------------------------------------------------------------------- diff --git a/protest/evals/suite.py b/protest/evals/suite.py index 905010c..c4af124 100644 --- a/protest/evals/suite.py +++ b/protest/evals/suite.py @@ -68,9 +68,18 @@ def eval( evaluators: list[Any] | None = None, tags: list[str] | None = None, timeout: float | None = None, - judge: Any = None, + judge: Judge | None = None, ) -> Callable[[FuncT], FuncT]: - """Register a scored eval test on this suite.""" + """Register a scored eval test on this suite. + + Args: + evaluators: Per-eval evaluators, appended to suite-level ones. + tags: Tags forwarded to the underlying `@suite.test`. + timeout: Per-eval timeout in seconds. + judge: Override the suite-level judge for this eval only. + Useful when one eval needs a stronger model than the rest + of the suite. Falls back to `self.judge` when omitted. 
+ """ def decorator(func: FuncT) -> FuncT: resolved_judge = judge or self._judge diff --git a/protest/history/plugin.py b/protest/history/plugin.py index ca738a3..8827db5 100644 --- a/protest/history/plugin.py +++ b/protest/history/plugin.py @@ -44,7 +44,9 @@ def __init__(self, history_dir: Path | None = None) -> None: # Test data self._test_suites: dict[str, dict[str, dict[str, Any]]] = {} self._suite_kinds: dict[str, SuiteKind] = {} - self._default_suite_name: str = "tests" + # Bucket name for tests without a suite_path; resolved during setup + # to the first non-eval suite name, or kept as the literal fallback. + self._default_suite_name: str | None = None # Eval data self._eval_reports: dict[str, EvalSuiteReport] = {} self._eval_suite_metadata: dict[str, dict[str, Any]] = {} @@ -74,7 +76,7 @@ def setup(self, session: ProTestSession) -> None: "name": suite.judge.name, "provider": suite.judge.provider, } - elif not self._default_suite_name or self._default_suite_name == "tests": + elif self._default_suite_name is None: self._default_suite_name = suite.name # -- Test event handlers -------------------------------------------------- @@ -93,7 +95,7 @@ def _record_test(self, result: TestResult, *, passed: bool) -> None: suite_name = ( result.suite_path.root_name if result.suite_path - else self._default_suite_name + else (self._default_suite_name or "tests") ) if suite_name not in self._test_suites: self._test_suites[suite_name] = {} diff --git a/tests/evals/test_judge.py b/tests/evals/test_judge.py index e711bdb..a27ea41 100644 --- a/tests/evals/test_judge.py +++ b/tests/evals/test_judge.py @@ -41,7 +41,8 @@ async def judge(self, prompt: str, output_type: type) -> JudgeResponse: ) if output_type is str: return JudgeResponse(output=f"judged: {prompt[:20]}") - # For dataclass types, try to construct with defaults + # Dataclass fallback: caller must use a dataclass whose fields all + # have defaults — no real LLM call to derive values from. 
return JudgeResponse(output=output_type()) From 18078d495c879ed6f10dce7b16e910982e9bb38e Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Sat, 25 Apr 2026 14:02:04 +0200 Subject: [PATCH 40/60] ci: ensure matrix Python version consistency and add verification step - Set `UV_PYTHON` to enforce the selected Python version in the matrix. - Add a verification step to confirm the expected Python version is used. --- .github/workflows/ci.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 22a0944..dc7b06c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -73,6 +73,11 @@ jobs: - os: windows-latest python-version: "3.12" runs-on: ${{ matrix.os }} + env: + # Force uv to honor the matrix Python version. Without this, uv picks + # the newest interpreter satisfying `requires-python` (often the system + # 3.12), making the matrix cosmetic. + UV_PYTHON: ${{ matrix.python-version }} steps: - uses: actions/checkout@v6 @@ -90,6 +95,9 @@ jobs: - name: Install dependencies run: uv sync --dev + - name: Verify Python version + run: uv run python -c "import sys; v = '${{ matrix.python-version }}'; assert sys.version.startswith(v), f'expected {v}, got {sys.version}'" + - name: Run tests if: matrix.os != 'ubuntu-latest' || matrix.python-version != '3.12' run: uv run pytest -vv From 6b9cc8370aa0c67223e6fc9b19863f6d8987277a Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Sat, 25 Apr 2026 14:19:10 +0200 Subject: [PATCH 41/60] refactor: replace `StrEnum` with `str, Enum` for Python 3.10 compatibility - Updated `SuiteKind` to inherit from `str` and `Enum` instead of `StrEnum`, ensuring compatibility with Python 3.10. - Adjusted `SuiteKind.__str__` method for consistent behavior. - Modified history plugin to handle `Enum.value` directly while maintaining default behavior. 
- Moved `Self` import to `protest.compat` for streamlined typing support. --- protest/entities/core.py | 12 +++++++++--- protest/history/plugin.py | 3 ++- protest/plugin.py | 3 ++- 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/protest/entities/core.py b/protest/entities/core.py index 5a8c680..d8b157b 100644 --- a/protest/entities/core.py +++ b/protest/entities/core.py @@ -1,7 +1,7 @@ from __future__ import annotations from dataclasses import dataclass, field -from enum import Enum, StrEnum +from enum import Enum from typing import TYPE_CHECKING, Any, TypeAlias if TYPE_CHECKING: @@ -20,12 +20,18 @@ FixtureCallable: TypeAlias = "Callable[..., Any]" -class SuiteKind(StrEnum): - """Kind of suite — determines behavior (eval wiring, history, reporting).""" +class SuiteKind(str, Enum): + """Kind of suite — determines behavior (eval wiring, history, reporting). + + Inherits from `str` (not `StrEnum`) for Python 3.10 compatibility. + """ TEST = "test" EVAL = "eval" + def __str__(self) -> str: + return self.value + class FixtureScope(Enum): """Scope level for fixtures.""" diff --git a/protest/history/plugin.py b/protest/history/plugin.py index 8827db5..00c5e8b 100644 --- a/protest/history/plugin.py +++ b/protest/history/plugin.py @@ -125,8 +125,9 @@ def on_session_end(self, result: Any) -> None: for suite_name, cases in self._test_suites.items(): total = len(cases) passed = sum(1 for c in cases.values() if c["passed"]) + kind = self._suite_kinds.get(suite_name) suites_data[suite_name] = { - "kind": str(self._suite_kinds.get(suite_name, "test")), + "kind": kind.value if kind is not None else "test", "total_cases": total, "passed": passed, "failed": total - passed, diff --git a/protest/plugin.py b/protest/plugin.py index 9589fff..895d7a5 100644 --- a/protest/plugin.py +++ b/protest/plugin.py @@ -1,12 +1,13 @@ from __future__ import annotations from dataclasses import dataclass, field -from typing import TYPE_CHECKING, Any, Self +from typing import 
TYPE_CHECKING, Any if TYPE_CHECKING: from argparse import ArgumentParser from collections.abc import Awaitable + from protest.compat import Self from protest.core.session import ProTestSession from protest.entities import ( FixtureInfo, From ef5a65b287ba214b5df5a8042c5dc05cf681a564 Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Sat, 25 Apr 2026 14:25:34 +0200 Subject: [PATCH 42/60] chore: remove `pydantic-evals` dependency and related code - Dropped `pydantic-evals` from dependencies and `pyproject.toml` `evals` extra. - Removed references to `pydantic-evals` in code and version reporting. - Cleaned up `uv.lock` and related metadata. --- protest/history/collector.py | 1 - pyproject.toml | 3 - uv.lock | 327 +---------------------------------- 3 files changed, 1 insertion(+), 330 deletions(-) diff --git a/protest/history/collector.py b/protest/history/collector.py index ee8bb1a..7aa8659 100644 --- a/protest/history/collector.py +++ b/protest/history/collector.py @@ -31,7 +31,6 @@ def collect_env_info() -> dict[str, Any]: return { "python_version": platform.python_version(), "protest_version": _get_pkg_version("protest"), - "pydantic_evals_version": _get_pkg_version("pydantic-evals"), "hostname": platform.node(), "os": sys.platform, "ci": ci_provider is not None, diff --git a/pyproject.toml b/pyproject.toml index 0dbe858..0cb8974 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -49,9 +49,6 @@ rich = [ web = [ "websockets>=12.0", ] -evals = [ - "pydantic-evals>=0.1", -] [tool.ruff] diff --git a/uv.lock b/uv.lock index e4d7032..7594a42 100644 --- a/uv.lock +++ b/uv.lock @@ -2,29 +2,6 @@ version = 1 revision = 3 requires-python = ">=3.10" -[[package]] -name = "annotated-types" -version = "0.7.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/ee/67/531ea369ba64dcff5ec9c3402f9f51bf748cec26dde048a2f973a4eea7f5/annotated_types-0.7.0.tar.gz", hash = 
"sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89", size = 16081, upload-time = "2024-05-20T21:33:25.928Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643, upload-time = "2024-05-20T21:33:24.1Z" }, -] - -[[package]] -name = "anyio" -version = "4.13.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "exceptiongroup", marker = "python_full_version < '3.11'" }, - { name = "idna" }, - { name = "typing-extensions", marker = "python_full_version < '3.13'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/19/14/2c5dd9f512b66549ae92767a9c7b330ae88e1932ca57876909410251fe13/anyio-4.13.0.tar.gz", hash = "sha256:334b70e641fd2221c1505b3890c69882fe4a2df910cba14d97019b90b24439dc", size = 231622, upload-time = "2026-03-24T12:59:09.671Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/da/42/e921fccf5015463e32a3cf6ee7f980a6ed0f395ceeaa45060b61d86486c2/anyio-4.13.0-py3-none-any.whl", hash = "sha256:08b310f9e24a9594186fd75b4f73f4a4152069e3853f1ed8bfbf58369f4ad708", size = 114353, upload-time = "2026-03-24T12:59:08.246Z" }, -] - [[package]] name = "attrs" version = "25.4.0" @@ -328,19 +305,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/76/91/7216b27286936c16f5b4d0c530087e4a54eead683e6b0b73dd0c64844af6/filelock-3.20.0-py3-none-any.whl", hash = "sha256:339b4732ffda5cd79b13f4e2711a31b0365ce445d95d243bb996273d072546a2", size = 16054, upload-time = "2025-10-08T18:03:48.35Z" }, ] -[[package]] -name = "genai-prices" -version = "0.0.56" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "httpx" }, - { name = "pydantic" }, -] -sdist = { url = 
"https://files.pythonhosted.org/packages/44/6b/94b3018a672c7775edfb485f0fed8f6068fba75e49b067e8a1ac5eb96764/genai_prices-0.0.56.tar.gz", hash = "sha256:ac24b16a84d0ab97539bfa48dfa4649689de8e3ce71c12ebacef29efb1998045", size = 65872, upload-time = "2026-03-20T20:33:00.732Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/a3/f6/8ef7e4c286deb2709d11ca96a5237caae3ef4876ab3c48095856cfd2df30/genai_prices-0.0.56-py3-none-any.whl", hash = "sha256:dbe86be8f3f556bed1b72209ed36851fec8b01793b3b220f42921a4e7da945f6", size = 68966, upload-time = "2026-03-20T20:33:02.555Z" }, -] - [[package]] name = "ghp-import" version = "2.1.0" @@ -353,52 +317,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f7/ec/67fbef5d497f86283db54c22eec6f6140243aae73265799baaaa19cd17fb/ghp_import-2.1.0-py3-none-any.whl", hash = "sha256:8337dd7b50877f163d4c0289bc1f1c7f127550241988d568c1db512c4324a619", size = 11034, upload-time = "2022-05-02T15:47:14.552Z" }, ] -[[package]] -name = "griffelib" -version = "2.0.2" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/9d/82/74f4a3310cdabfbb10da554c3a672847f1ed33c6f61dd472681ce7f1fe67/griffelib-2.0.2.tar.gz", hash = "sha256:3cf20b3bc470e83763ffbf236e0076b1211bac1bc67de13daf494640f2de707e", size = 166461, upload-time = "2026-03-27T11:34:51.091Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/11/8c/c9138d881c79aa0ea9ed83cbd58d5ca75624378b38cee225dcf5c42cc91f/griffelib-2.0.2-py3-none-any.whl", hash = "sha256:925c857658fb1ba40c0772c37acbc2ab650bd794d9c1b9726922e36ea4117ea1", size = 142357, upload-time = "2026-03-27T11:34:46.275Z" }, -] - -[[package]] -name = "h11" -version = "0.16.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/01/ee/02a2c011bdab74c6fb3c75474d40b3052059d95df7e73351460c8588d963/h11-0.16.0.tar.gz", hash = "sha256:4e35b956cf45792e4caa5885e69fba00bdbc6ffafbfa020300e549b208ee5ff1", size 
= 101250, upload-time = "2025-04-24T03:35:25.427Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/04/4b/29cac41a4d98d144bf5f6d33995617b185d14b22401f75ca86f384e87ff1/h11-0.16.0-py3-none-any.whl", hash = "sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86", size = 37515, upload-time = "2025-04-24T03:35:24.344Z" }, -] - -[[package]] -name = "httpcore" -version = "1.0.9" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "certifi" }, - { name = "h11" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/06/94/82699a10bca87a5556c9c59b5963f2d039dbd239f25bc2a63907a05a14cb/httpcore-1.0.9.tar.gz", hash = "sha256:6e34463af53fd2ab5d807f399a9b45ea31c3dfa2276f15a2c3f00afff6e176e8", size = 85484, upload-time = "2025-04-24T22:06:22.219Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/7e/f5/f66802a942d491edb555dd61e3a9961140fd64c90bce1eafd741609d334d/httpcore-1.0.9-py3-none-any.whl", hash = "sha256:2d400746a40668fc9dec9810239072b40b4484b640a8c38fd654a024c7a1bf55", size = 78784, upload-time = "2025-04-24T22:06:20.566Z" }, -] - -[[package]] -name = "httpx" -version = "0.28.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "anyio" }, - { name = "certifi" }, - { name = "httpcore" }, - { name = "idna" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/b1/df/48c586a5fe32a0f01324ee087459e112ebb7224f646c0b5023f5e79e9956/httpx-0.28.1.tar.gz", hash = "sha256:75e98c5f16b0f35b567856f597f06ff2270a374470a5c2392242528e3e3e42fc", size = 141406, upload-time = "2024-12-06T15:37:23.222Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload-time = "2024-12-06T15:37:21.509Z" }, -] - [[package]] name = "identify" version = "2.6.15" @@ -417,18 +335,6 @@ 
wheels = [ { url = "https://files.pythonhosted.org/packages/0e/61/66938bbb5fc52dbdf84594873d5b51fb1f7c7794e9c0f5bd885f30bc507b/idna-3.11-py3-none-any.whl", hash = "sha256:771a87f49d9defaf64091e6e6fe9c18d4833f140bd19464795bc32d966ca37ea", size = 71008, upload-time = "2025-10-12T14:55:18.883Z" }, ] -[[package]] -name = "importlib-metadata" -version = "8.7.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "zipp" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/f3/49/3b30cad09e7771a4982d9975a8cbf64f00d4a1ececb53297f1d9a7be1b10/importlib_metadata-8.7.1.tar.gz", hash = "sha256:49fef1ae6440c182052f407c8d34a68f72efc36db9ca90dc0113398f2fdde8bb", size = 57107, upload-time = "2025-12-21T10:00:19.278Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/fa/5e/f8e9a1d23b9c20a551a8a02ea3637b4642e22c2626e3a13a9a29cdea99eb/importlib_metadata-8.7.1-py3-none-any.whl", hash = "sha256:5a1f80bf1daa489495071efbb095d75a634cf28a8bc299581244063b53176151", size = 27865, upload-time = "2025-12-21T10:00:18.329Z" }, -] - [[package]] name = "iniconfig" version = "2.1.0" @@ -562,15 +468,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b2/c8/d148e041732d631fc76036f8b30fae4e77b027a1e95b7a84bb522481a940/librt-0.8.1-cp314-cp314t-win_arm64.whl", hash = "sha256:bf512a71a23504ed08103a13c941f763db13fb11177beb3d9244c98c29fb4a61", size = 48755, upload-time = "2026-02-17T16:12:47.943Z" }, ] -[[package]] -name = "logfire-api" -version = "4.31.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/08/a2/8d5a3c1c282d5f2bd9f5e9ddd5288d1414a53301ce389af9016b6d82bd50/logfire_api-4.31.0.tar.gz", hash = "sha256:fc4b01257ebd4ce297ad374ed201eb1a9213b999f6ae6df45cfca5bd0ef378f8", size = 77838, upload-time = "2026-03-27T19:00:47.545Z" } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/26/27/9372b7492b3e146908d520f8599909311cd930175801ad219171fafc6f3e/logfire_api-4.31.0-py3-none-any.whl", hash = "sha256:3c1f502fd4eb8ef0996427a5cf275fd8f327f38600650a1f53071a8171c812db", size = 123402, upload-time = "2026-03-27T19:00:44.952Z" }, -] - [[package]] name = "markdown" version = "3.10" @@ -840,19 +737,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d2/1d/1b658dbd2b9fa9c4c9f32accbfc0205d532c8c6194dc0f2a4c0428e7128a/nodeenv-1.9.1-py2.py3-none-any.whl", hash = "sha256:ba11c9782d29c27c70ffbdda2d7415098754709be8a7056d79a737cd901155c9", size = 22314, upload-time = "2024-06-04T18:44:08.352Z" }, ] -[[package]] -name = "opentelemetry-api" -version = "1.40.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "importlib-metadata" }, - { name = "typing-extensions" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/2c/1d/4049a9e8698361cc1a1aa03a6c59e4fa4c71e0c0f94a30f988a6876a2ae6/opentelemetry_api-1.40.0.tar.gz", hash = "sha256:159be641c0b04d11e9ecd576906462773eb97ae1b657730f0ecf64d32071569f", size = 70851, upload-time = "2026-03-04T14:17:21.555Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/5f/bf/93795954016c522008da367da292adceed71cca6ee1717e1d64c83089099/opentelemetry_api-1.40.0-py3-none-any.whl", hash = "sha256:82dd69331ae74b06f6a874704be0cfaa49a1650e1537d4a813b86ecef7d0ecf9", size = 68676, upload-time = "2026-03-04T14:17:01.24Z" }, -] - [[package]] name = "packaging" version = "24.2" @@ -923,9 +807,6 @@ dependencies = [ ] [package.optional-dependencies] -evals = [ - { name = "pydantic-evals" }, -] rich = [ { name = "rich" }, ] @@ -953,12 +834,11 @@ docs = [ [package.metadata] requires-dist = [ - { name = "pydantic-evals", marker = "extra == 'evals'", specifier = ">=0.1" }, { name = "rich", marker = "extra == 'rich'", specifier = ">=13.0" }, { name = "typing-extensions", specifier = ">=4.15.0" }, { name = "websockets", marker = "extra == 
'web'", specifier = ">=12.0" }, ] -provides-extras = ["rich", "web", "evals"] +provides-extras = ["rich", "web"] [package.metadata.requires-dev] dev = [ @@ -978,190 +858,6 @@ docs = [ { name = "mkdocs-material", specifier = ">=9.7.0" }, ] -[[package]] -name = "pydantic" -version = "2.12.5" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "annotated-types" }, - { name = "pydantic-core" }, - { name = "typing-extensions" }, - { name = "typing-inspection" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/69/44/36f1a6e523abc58ae5f928898e4aca2e0ea509b5aa6f6f392a5d882be928/pydantic-2.12.5.tar.gz", hash = "sha256:4d351024c75c0f085a9febbb665ce8c0c6ec5d30e903bdb6394b7ede26aebb49", size = 821591, upload-time = "2025-11-26T15:11:46.471Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/5a/87/b70ad306ebb6f9b585f114d0ac2137d792b48be34d732d60e597c2f8465a/pydantic-2.12.5-py3-none-any.whl", hash = "sha256:e561593fccf61e8a20fc46dfc2dfe075b8be7d0188df33f221ad1f0139180f9d", size = 463580, upload-time = "2025-11-26T15:11:44.605Z" }, -] - -[[package]] -name = "pydantic-ai-slim" -version = "1.73.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "exceptiongroup", marker = "python_full_version < '3.11'" }, - { name = "genai-prices" }, - { name = "griffelib" }, - { name = "httpx" }, - { name = "opentelemetry-api" }, - { name = "pydantic" }, - { name = "pydantic-graph" }, - { name = "typing-inspection" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/6a/1b/a5e18c7c721a3cfce5b17f86cb99e4142fcb70f38ea6d2b8963c2df445e1/pydantic_ai_slim-1.73.0.tar.gz", hash = "sha256:758d5bedb4b4f484c433672639bfc87af216a38453b1539ae10928a9ca62ff62", size = 497208, upload-time = "2026-03-27T03:49:49.459Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/04/3b/6aa1874cd0ccbc83c17c8eb308834bf004c8d4344c27cd8048851d4b284d/pydantic_ai_slim-1.73.0-py3-none-any.whl", hash = 
"sha256:f7176ce6c78539e1070d7e22549186862c2f6e6ea8b05b3aaad8a1942ba1ff4f", size = 638701, upload-time = "2026-03-27T03:49:42.804Z" }, -] - -[[package]] -name = "pydantic-core" -version = "2.41.5" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "typing-extensions" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/71/70/23b021c950c2addd24ec408e9ab05d59b035b39d97cdc1130e1bce647bb6/pydantic_core-2.41.5.tar.gz", hash = "sha256:08daa51ea16ad373ffd5e7606252cc32f07bc72b28284b6bc9c6df804816476e", size = 460952, upload-time = "2025-11-04T13:43:49.098Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c6/90/32c9941e728d564b411d574d8ee0cf09b12ec978cb22b294995bae5549a5/pydantic_core-2.41.5-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:77b63866ca88d804225eaa4af3e664c5faf3568cea95360d21f4725ab6e07146", size = 2107298, upload-time = "2025-11-04T13:39:04.116Z" }, - { url = "https://files.pythonhosted.org/packages/fb/a8/61c96a77fe28993d9a6fb0f4127e05430a267b235a124545d79fea46dd65/pydantic_core-2.41.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:dfa8a0c812ac681395907e71e1274819dec685fec28273a28905df579ef137e2", size = 1901475, upload-time = "2025-11-04T13:39:06.055Z" }, - { url = "https://files.pythonhosted.org/packages/5d/b6/338abf60225acc18cdc08b4faef592d0310923d19a87fba1faf05af5346e/pydantic_core-2.41.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5921a4d3ca3aee735d9fd163808f5e8dd6c6972101e4adbda9a4667908849b97", size = 1918815, upload-time = "2025-11-04T13:39:10.41Z" }, - { url = "https://files.pythonhosted.org/packages/d1/1c/2ed0433e682983d8e8cba9c8d8ef274d4791ec6a6f24c58935b90e780e0a/pydantic_core-2.41.5-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e25c479382d26a2a41b7ebea1043564a937db462816ea07afa8a44c0866d52f9", size = 2065567, upload-time = "2025-11-04T13:39:12.244Z" }, - { url = 
"https://files.pythonhosted.org/packages/b3/24/cf84974ee7d6eae06b9e63289b7b8f6549d416b5c199ca2d7ce13bbcf619/pydantic_core-2.41.5-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f547144f2966e1e16ae626d8ce72b4cfa0caedc7fa28052001c94fb2fcaa1c52", size = 2230442, upload-time = "2025-11-04T13:39:13.962Z" }, - { url = "https://files.pythonhosted.org/packages/fd/21/4e287865504b3edc0136c89c9c09431be326168b1eb7841911cbc877a995/pydantic_core-2.41.5-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6f52298fbd394f9ed112d56f3d11aabd0d5bd27beb3084cc3d8ad069483b8941", size = 2350956, upload-time = "2025-11-04T13:39:15.889Z" }, - { url = "https://files.pythonhosted.org/packages/a8/76/7727ef2ffa4b62fcab916686a68a0426b9b790139720e1934e8ba797e238/pydantic_core-2.41.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:100baa204bb412b74fe285fb0f3a385256dad1d1879f0a5cb1499ed2e83d132a", size = 2068253, upload-time = "2025-11-04T13:39:17.403Z" }, - { url = "https://files.pythonhosted.org/packages/d5/8c/a4abfc79604bcb4c748e18975c44f94f756f08fb04218d5cb87eb0d3a63e/pydantic_core-2.41.5-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:05a2c8852530ad2812cb7914dc61a1125dc4e06252ee98e5638a12da6cc6fb6c", size = 2177050, upload-time = "2025-11-04T13:39:19.351Z" }, - { url = "https://files.pythonhosted.org/packages/67/b1/de2e9a9a79b480f9cb0b6e8b6ba4c50b18d4e89852426364c66aa82bb7b3/pydantic_core-2.41.5-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:29452c56df2ed968d18d7e21f4ab0ac55e71dc59524872f6fc57dcf4a3249ed2", size = 2147178, upload-time = "2025-11-04T13:39:21Z" }, - { url = "https://files.pythonhosted.org/packages/16/c1/dfb33f837a47b20417500efaa0378adc6635b3c79e8369ff7a03c494b4ac/pydantic_core-2.41.5-cp310-cp310-musllinux_1_1_armv7l.whl", hash = "sha256:d5160812ea7a8a2ffbe233d8da666880cad0cbaf5d4de74ae15c313213d62556", size = 2341833, upload-time = "2025-11-04T13:39:22.606Z" }, - { url = 
"https://files.pythonhosted.org/packages/47/36/00f398642a0f4b815a9a558c4f1dca1b4020a7d49562807d7bc9ff279a6c/pydantic_core-2.41.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:df3959765b553b9440adfd3c795617c352154e497a4eaf3752555cfb5da8fc49", size = 2321156, upload-time = "2025-11-04T13:39:25.843Z" }, - { url = "https://files.pythonhosted.org/packages/7e/70/cad3acd89fde2010807354d978725ae111ddf6d0ea46d1ea1775b5c1bd0c/pydantic_core-2.41.5-cp310-cp310-win32.whl", hash = "sha256:1f8d33a7f4d5a7889e60dc39856d76d09333d8a6ed0f5f1190635cbec70ec4ba", size = 1989378, upload-time = "2025-11-04T13:39:27.92Z" }, - { url = "https://files.pythonhosted.org/packages/76/92/d338652464c6c367e5608e4488201702cd1cbb0f33f7b6a85a60fe5f3720/pydantic_core-2.41.5-cp310-cp310-win_amd64.whl", hash = "sha256:62de39db01b8d593e45871af2af9e497295db8d73b085f6bfd0b18c83c70a8f9", size = 2013622, upload-time = "2025-11-04T13:39:29.848Z" }, - { url = "https://files.pythonhosted.org/packages/e8/72/74a989dd9f2084b3d9530b0915fdda64ac48831c30dbf7c72a41a5232db8/pydantic_core-2.41.5-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:a3a52f6156e73e7ccb0f8cced536adccb7042be67cb45f9562e12b319c119da6", size = 2105873, upload-time = "2025-11-04T13:39:31.373Z" }, - { url = "https://files.pythonhosted.org/packages/12/44/37e403fd9455708b3b942949e1d7febc02167662bf1a7da5b78ee1ea2842/pydantic_core-2.41.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7f3bf998340c6d4b0c9a2f02d6a400e51f123b59565d74dc60d252ce888c260b", size = 1899826, upload-time = "2025-11-04T13:39:32.897Z" }, - { url = "https://files.pythonhosted.org/packages/33/7f/1d5cab3ccf44c1935a359d51a8a2a9e1a654b744b5e7f80d41b88d501eec/pydantic_core-2.41.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:378bec5c66998815d224c9ca994f1e14c0c21cb95d2f52b6021cc0b2a58f2a5a", size = 1917869, upload-time = "2025-11-04T13:39:34.469Z" }, - { url = 
"https://files.pythonhosted.org/packages/6e/6a/30d94a9674a7fe4f4744052ed6c5e083424510be1e93da5bc47569d11810/pydantic_core-2.41.5-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e7b576130c69225432866fe2f4a469a85a54ade141d96fd396dffcf607b558f8", size = 2063890, upload-time = "2025-11-04T13:39:36.053Z" }, - { url = "https://files.pythonhosted.org/packages/50/be/76e5d46203fcb2750e542f32e6c371ffa9b8ad17364cf94bb0818dbfb50c/pydantic_core-2.41.5-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6cb58b9c66f7e4179a2d5e0f849c48eff5c1fca560994d6eb6543abf955a149e", size = 2229740, upload-time = "2025-11-04T13:39:37.753Z" }, - { url = "https://files.pythonhosted.org/packages/d3/ee/fed784df0144793489f87db310a6bbf8118d7b630ed07aa180d6067e653a/pydantic_core-2.41.5-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:88942d3a3dff3afc8288c21e565e476fc278902ae4d6d134f1eeda118cc830b1", size = 2350021, upload-time = "2025-11-04T13:39:40.94Z" }, - { url = "https://files.pythonhosted.org/packages/c8/be/8fed28dd0a180dca19e72c233cbf58efa36df055e5b9d90d64fd1740b828/pydantic_core-2.41.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f31d95a179f8d64d90f6831d71fa93290893a33148d890ba15de25642c5d075b", size = 2066378, upload-time = "2025-11-04T13:39:42.523Z" }, - { url = "https://files.pythonhosted.org/packages/b0/3b/698cf8ae1d536a010e05121b4958b1257f0b5522085e335360e53a6b1c8b/pydantic_core-2.41.5-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:c1df3d34aced70add6f867a8cf413e299177e0c22660cc767218373d0779487b", size = 2175761, upload-time = "2025-11-04T13:39:44.553Z" }, - { url = "https://files.pythonhosted.org/packages/b8/ba/15d537423939553116dea94ce02f9c31be0fa9d0b806d427e0308ec17145/pydantic_core-2.41.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:4009935984bd36bd2c774e13f9a09563ce8de4abaa7226f5108262fa3e637284", size = 2146303, upload-time = 
"2025-11-04T13:39:46.238Z" }, - { url = "https://files.pythonhosted.org/packages/58/7f/0de669bf37d206723795f9c90c82966726a2ab06c336deba4735b55af431/pydantic_core-2.41.5-cp311-cp311-musllinux_1_1_armv7l.whl", hash = "sha256:34a64bc3441dc1213096a20fe27e8e128bd3ff89921706e83c0b1ac971276594", size = 2340355, upload-time = "2025-11-04T13:39:48.002Z" }, - { url = "https://files.pythonhosted.org/packages/e5/de/e7482c435b83d7e3c3ee5ee4451f6e8973cff0eb6007d2872ce6383f6398/pydantic_core-2.41.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:c9e19dd6e28fdcaa5a1de679aec4141f691023916427ef9bae8584f9c2fb3b0e", size = 2319875, upload-time = "2025-11-04T13:39:49.705Z" }, - { url = "https://files.pythonhosted.org/packages/fe/e6/8c9e81bb6dd7560e33b9053351c29f30c8194b72f2d6932888581f503482/pydantic_core-2.41.5-cp311-cp311-win32.whl", hash = "sha256:2c010c6ded393148374c0f6f0bf89d206bf3217f201faa0635dcd56bd1520f6b", size = 1987549, upload-time = "2025-11-04T13:39:51.842Z" }, - { url = "https://files.pythonhosted.org/packages/11/66/f14d1d978ea94d1bc21fc98fcf570f9542fe55bfcc40269d4e1a21c19bf7/pydantic_core-2.41.5-cp311-cp311-win_amd64.whl", hash = "sha256:76ee27c6e9c7f16f47db7a94157112a2f3a00e958bc626e2f4ee8bec5c328fbe", size = 2011305, upload-time = "2025-11-04T13:39:53.485Z" }, - { url = "https://files.pythonhosted.org/packages/56/d8/0e271434e8efd03186c5386671328154ee349ff0354d83c74f5caaf096ed/pydantic_core-2.41.5-cp311-cp311-win_arm64.whl", hash = "sha256:4bc36bbc0b7584de96561184ad7f012478987882ebf9f9c389b23f432ea3d90f", size = 1972902, upload-time = "2025-11-04T13:39:56.488Z" }, - { url = "https://files.pythonhosted.org/packages/5f/5d/5f6c63eebb5afee93bcaae4ce9a898f3373ca23df3ccaef086d0233a35a7/pydantic_core-2.41.5-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:f41a7489d32336dbf2199c8c0a215390a751c5b014c2c1c5366e817202e9cdf7", size = 2110990, upload-time = "2025-11-04T13:39:58.079Z" }, - { url = 
"https://files.pythonhosted.org/packages/aa/32/9c2e8ccb57c01111e0fd091f236c7b371c1bccea0fa85247ac55b1e2b6b6/pydantic_core-2.41.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:070259a8818988b9a84a449a2a7337c7f430a22acc0859c6b110aa7212a6d9c0", size = 1896003, upload-time = "2025-11-04T13:39:59.956Z" }, - { url = "https://files.pythonhosted.org/packages/68/b8/a01b53cb0e59139fbc9e4fda3e9724ede8de279097179be4ff31f1abb65a/pydantic_core-2.41.5-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e96cea19e34778f8d59fe40775a7a574d95816eb150850a85a7a4c8f4b94ac69", size = 1919200, upload-time = "2025-11-04T13:40:02.241Z" }, - { url = "https://files.pythonhosted.org/packages/38/de/8c36b5198a29bdaade07b5985e80a233a5ac27137846f3bc2d3b40a47360/pydantic_core-2.41.5-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ed2e99c456e3fadd05c991f8f437ef902e00eedf34320ba2b0842bd1c3ca3a75", size = 2052578, upload-time = "2025-11-04T13:40:04.401Z" }, - { url = "https://files.pythonhosted.org/packages/00/b5/0e8e4b5b081eac6cb3dbb7e60a65907549a1ce035a724368c330112adfdd/pydantic_core-2.41.5-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:65840751b72fbfd82c3c640cff9284545342a4f1eb1586ad0636955b261b0b05", size = 2208504, upload-time = "2025-11-04T13:40:06.072Z" }, - { url = "https://files.pythonhosted.org/packages/77/56/87a61aad59c7c5b9dc8caad5a41a5545cba3810c3e828708b3d7404f6cef/pydantic_core-2.41.5-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e536c98a7626a98feb2d3eaf75944ef6f3dbee447e1f841eae16f2f0a72d8ddc", size = 2335816, upload-time = "2025-11-04T13:40:07.835Z" }, - { url = "https://files.pythonhosted.org/packages/0d/76/941cc9f73529988688a665a5c0ecff1112b3d95ab48f81db5f7606f522d3/pydantic_core-2.41.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eceb81a8d74f9267ef4081e246ffd6d129da5d87e37a77c9bde550cb04870c1c", size = 2075366, upload-time = 
"2025-11-04T13:40:09.804Z" }, - { url = "https://files.pythonhosted.org/packages/d3/43/ebef01f69baa07a482844faaa0a591bad1ef129253ffd0cdaa9d8a7f72d3/pydantic_core-2.41.5-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d38548150c39b74aeeb0ce8ee1d8e82696f4a4e16ddc6de7b1d8823f7de4b9b5", size = 2171698, upload-time = "2025-11-04T13:40:12.004Z" }, - { url = "https://files.pythonhosted.org/packages/b1/87/41f3202e4193e3bacfc2c065fab7706ebe81af46a83d3e27605029c1f5a6/pydantic_core-2.41.5-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:c23e27686783f60290e36827f9c626e63154b82b116d7fe9adba1fda36da706c", size = 2132603, upload-time = "2025-11-04T13:40:13.868Z" }, - { url = "https://files.pythonhosted.org/packages/49/7d/4c00df99cb12070b6bccdef4a195255e6020a550d572768d92cc54dba91a/pydantic_core-2.41.5-cp312-cp312-musllinux_1_1_armv7l.whl", hash = "sha256:482c982f814460eabe1d3bb0adfdc583387bd4691ef00b90575ca0d2b6fe2294", size = 2329591, upload-time = "2025-11-04T13:40:15.672Z" }, - { url = "https://files.pythonhosted.org/packages/cc/6a/ebf4b1d65d458f3cda6a7335d141305dfa19bdc61140a884d165a8a1bbc7/pydantic_core-2.41.5-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:bfea2a5f0b4d8d43adf9d7b8bf019fb46fdd10a2e5cde477fbcb9d1fa08c68e1", size = 2319068, upload-time = "2025-11-04T13:40:17.532Z" }, - { url = "https://files.pythonhosted.org/packages/49/3b/774f2b5cd4192d5ab75870ce4381fd89cf218af999515baf07e7206753f0/pydantic_core-2.41.5-cp312-cp312-win32.whl", hash = "sha256:b74557b16e390ec12dca509bce9264c3bbd128f8a2c376eaa68003d7f327276d", size = 1985908, upload-time = "2025-11-04T13:40:19.309Z" }, - { url = "https://files.pythonhosted.org/packages/86/45/00173a033c801cacf67c190fef088789394feaf88a98a7035b0e40d53dc9/pydantic_core-2.41.5-cp312-cp312-win_amd64.whl", hash = "sha256:1962293292865bca8e54702b08a4f26da73adc83dd1fcf26fbc875b35d81c815", size = 2020145, upload-time = "2025-11-04T13:40:21.548Z" }, - { url = 
"https://files.pythonhosted.org/packages/f9/22/91fbc821fa6d261b376a3f73809f907cec5ca6025642c463d3488aad22fb/pydantic_core-2.41.5-cp312-cp312-win_arm64.whl", hash = "sha256:1746d4a3d9a794cacae06a5eaaccb4b8643a131d45fbc9af23e353dc0a5ba5c3", size = 1976179, upload-time = "2025-11-04T13:40:23.393Z" }, - { url = "https://files.pythonhosted.org/packages/87/06/8806241ff1f70d9939f9af039c6c35f2360cf16e93c2ca76f184e76b1564/pydantic_core-2.41.5-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:941103c9be18ac8daf7b7adca8228f8ed6bb7a1849020f643b3a14d15b1924d9", size = 2120403, upload-time = "2025-11-04T13:40:25.248Z" }, - { url = "https://files.pythonhosted.org/packages/94/02/abfa0e0bda67faa65fef1c84971c7e45928e108fe24333c81f3bfe35d5f5/pydantic_core-2.41.5-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:112e305c3314f40c93998e567879e887a3160bb8689ef3d2c04b6cc62c33ac34", size = 1896206, upload-time = "2025-11-04T13:40:27.099Z" }, - { url = "https://files.pythonhosted.org/packages/15/df/a4c740c0943e93e6500f9eb23f4ca7ec9bf71b19e608ae5b579678c8d02f/pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0cbaad15cb0c90aa221d43c00e77bb33c93e8d36e0bf74760cd00e732d10a6a0", size = 1919307, upload-time = "2025-11-04T13:40:29.806Z" }, - { url = "https://files.pythonhosted.org/packages/9a/e3/6324802931ae1d123528988e0e86587c2072ac2e5394b4bc2bc34b61ff6e/pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:03ca43e12fab6023fc79d28ca6b39b05f794ad08ec2feccc59a339b02f2b3d33", size = 2063258, upload-time = "2025-11-04T13:40:33.544Z" }, - { url = "https://files.pythonhosted.org/packages/c9/d4/2230d7151d4957dd79c3044ea26346c148c98fbf0ee6ebd41056f2d62ab5/pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:dc799088c08fa04e43144b164feb0c13f9a0bc40503f8df3e9fde58a3c0c101e", size = 2214917, upload-time = "2025-11-04T13:40:35.479Z" }, - { url = 
"https://files.pythonhosted.org/packages/e6/9f/eaac5df17a3672fef0081b6c1bb0b82b33ee89aa5cec0d7b05f52fd4a1fa/pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:97aeba56665b4c3235a0e52b2c2f5ae9cd071b8a8310ad27bddb3f7fb30e9aa2", size = 2332186, upload-time = "2025-11-04T13:40:37.436Z" }, - { url = "https://files.pythonhosted.org/packages/cf/4e/35a80cae583a37cf15604b44240e45c05e04e86f9cfd766623149297e971/pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:406bf18d345822d6c21366031003612b9c77b3e29ffdb0f612367352aab7d586", size = 2073164, upload-time = "2025-11-04T13:40:40.289Z" }, - { url = "https://files.pythonhosted.org/packages/bf/e3/f6e262673c6140dd3305d144d032f7bd5f7497d3871c1428521f19f9efa2/pydantic_core-2.41.5-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b93590ae81f7010dbe380cdeab6f515902ebcbefe0b9327cc4804d74e93ae69d", size = 2179146, upload-time = "2025-11-04T13:40:42.809Z" }, - { url = "https://files.pythonhosted.org/packages/75/c7/20bd7fc05f0c6ea2056a4565c6f36f8968c0924f19b7d97bbfea55780e73/pydantic_core-2.41.5-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:01a3d0ab748ee531f4ea6c3e48ad9dac84ddba4b0d82291f87248f2f9de8d740", size = 2137788, upload-time = "2025-11-04T13:40:44.752Z" }, - { url = "https://files.pythonhosted.org/packages/3a/8d/34318ef985c45196e004bc46c6eab2eda437e744c124ef0dbe1ff2c9d06b/pydantic_core-2.41.5-cp313-cp313-musllinux_1_1_armv7l.whl", hash = "sha256:6561e94ba9dacc9c61bce40e2d6bdc3bfaa0259d3ff36ace3b1e6901936d2e3e", size = 2340133, upload-time = "2025-11-04T13:40:46.66Z" }, - { url = "https://files.pythonhosted.org/packages/9c/59/013626bf8c78a5a5d9350d12e7697d3d4de951a75565496abd40ccd46bee/pydantic_core-2.41.5-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:915c3d10f81bec3a74fbd4faebe8391013ba61e5a1a8d48c4455b923bdda7858", size = 2324852, upload-time = "2025-11-04T13:40:48.575Z" }, - { url = 
"https://files.pythonhosted.org/packages/1a/d9/c248c103856f807ef70c18a4f986693a46a8ffe1602e5d361485da502d20/pydantic_core-2.41.5-cp313-cp313-win32.whl", hash = "sha256:650ae77860b45cfa6e2cdafc42618ceafab3a2d9a3811fcfbd3bbf8ac3c40d36", size = 1994679, upload-time = "2025-11-04T13:40:50.619Z" }, - { url = "https://files.pythonhosted.org/packages/9e/8b/341991b158ddab181cff136acd2552c9f35bd30380422a639c0671e99a91/pydantic_core-2.41.5-cp313-cp313-win_amd64.whl", hash = "sha256:79ec52ec461e99e13791ec6508c722742ad745571f234ea6255bed38c6480f11", size = 2019766, upload-time = "2025-11-04T13:40:52.631Z" }, - { url = "https://files.pythonhosted.org/packages/73/7d/f2f9db34af103bea3e09735bb40b021788a5e834c81eedb541991badf8f5/pydantic_core-2.41.5-cp313-cp313-win_arm64.whl", hash = "sha256:3f84d5c1b4ab906093bdc1ff10484838aca54ef08de4afa9de0f5f14d69639cd", size = 1981005, upload-time = "2025-11-04T13:40:54.734Z" }, - { url = "https://files.pythonhosted.org/packages/ea/28/46b7c5c9635ae96ea0fbb779e271a38129df2550f763937659ee6c5dbc65/pydantic_core-2.41.5-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:3f37a19d7ebcdd20b96485056ba9e8b304e27d9904d233d7b1015db320e51f0a", size = 2119622, upload-time = "2025-11-04T13:40:56.68Z" }, - { url = "https://files.pythonhosted.org/packages/74/1a/145646e5687e8d9a1e8d09acb278c8535ebe9e972e1f162ed338a622f193/pydantic_core-2.41.5-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:1d1d9764366c73f996edd17abb6d9d7649a7eb690006ab6adbda117717099b14", size = 1891725, upload-time = "2025-11-04T13:40:58.807Z" }, - { url = "https://files.pythonhosted.org/packages/23/04/e89c29e267b8060b40dca97bfc64a19b2a3cf99018167ea1677d96368273/pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:25e1c2af0fce638d5f1988b686f3b3ea8cd7de5f244ca147c777769e798a9cd1", size = 1915040, upload-time = "2025-11-04T13:41:00.853Z" }, - { url = 
"https://files.pythonhosted.org/packages/84/a3/15a82ac7bd97992a82257f777b3583d3e84bdb06ba6858f745daa2ec8a85/pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:506d766a8727beef16b7adaeb8ee6217c64fc813646b424d0804d67c16eddb66", size = 2063691, upload-time = "2025-11-04T13:41:03.504Z" }, - { url = "https://files.pythonhosted.org/packages/74/9b/0046701313c6ef08c0c1cf0e028c67c770a4e1275ca73131563c5f2a310a/pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4819fa52133c9aa3c387b3328f25c1facc356491e6135b459f1de698ff64d869", size = 2213897, upload-time = "2025-11-04T13:41:05.804Z" }, - { url = "https://files.pythonhosted.org/packages/8a/cd/6bac76ecd1b27e75a95ca3a9a559c643b3afcd2dd62086d4b7a32a18b169/pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2b761d210c9ea91feda40d25b4efe82a1707da2ef62901466a42492c028553a2", size = 2333302, upload-time = "2025-11-04T13:41:07.809Z" }, - { url = "https://files.pythonhosted.org/packages/4c/d2/ef2074dc020dd6e109611a8be4449b98cd25e1b9b8a303c2f0fca2f2bcf7/pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:22f0fb8c1c583a3b6f24df2470833b40207e907b90c928cc8d3594b76f874375", size = 2064877, upload-time = "2025-11-04T13:41:09.827Z" }, - { url = "https://files.pythonhosted.org/packages/18/66/e9db17a9a763d72f03de903883c057b2592c09509ccfe468187f2a2eef29/pydantic_core-2.41.5-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2782c870e99878c634505236d81e5443092fba820f0373997ff75f90f68cd553", size = 2180680, upload-time = "2025-11-04T13:41:12.379Z" }, - { url = "https://files.pythonhosted.org/packages/d3/9e/3ce66cebb929f3ced22be85d4c2399b8e85b622db77dad36b73c5387f8f8/pydantic_core-2.41.5-cp314-cp314-musllinux_1_1_aarch64.whl", hash = "sha256:0177272f88ab8312479336e1d777f6b124537d47f2123f89cb37e0accea97f90", size = 2138960, upload-time = 
"2025-11-04T13:41:14.627Z" }, - { url = "https://files.pythonhosted.org/packages/a6/62/205a998f4327d2079326b01abee48e502ea739d174f0a89295c481a2272e/pydantic_core-2.41.5-cp314-cp314-musllinux_1_1_armv7l.whl", hash = "sha256:63510af5e38f8955b8ee5687740d6ebf7c2a0886d15a6d65c32814613681bc07", size = 2339102, upload-time = "2025-11-04T13:41:16.868Z" }, - { url = "https://files.pythonhosted.org/packages/3c/0d/f05e79471e889d74d3d88f5bd20d0ed189ad94c2423d81ff8d0000aab4ff/pydantic_core-2.41.5-cp314-cp314-musllinux_1_1_x86_64.whl", hash = "sha256:e56ba91f47764cc14f1daacd723e3e82d1a89d783f0f5afe9c364b8bb491ccdb", size = 2326039, upload-time = "2025-11-04T13:41:18.934Z" }, - { url = "https://files.pythonhosted.org/packages/ec/e1/e08a6208bb100da7e0c4b288eed624a703f4d129bde2da475721a80cab32/pydantic_core-2.41.5-cp314-cp314-win32.whl", hash = "sha256:aec5cf2fd867b4ff45b9959f8b20ea3993fc93e63c7363fe6851424c8a7e7c23", size = 1995126, upload-time = "2025-11-04T13:41:21.418Z" }, - { url = "https://files.pythonhosted.org/packages/48/5d/56ba7b24e9557f99c9237e29f5c09913c81eeb2f3217e40e922353668092/pydantic_core-2.41.5-cp314-cp314-win_amd64.whl", hash = "sha256:8e7c86f27c585ef37c35e56a96363ab8de4e549a95512445b85c96d3e2f7c1bf", size = 2015489, upload-time = "2025-11-04T13:41:24.076Z" }, - { url = "https://files.pythonhosted.org/packages/4e/bb/f7a190991ec9e3e0ba22e4993d8755bbc4a32925c0b5b42775c03e8148f9/pydantic_core-2.41.5-cp314-cp314-win_arm64.whl", hash = "sha256:e672ba74fbc2dc8eea59fb6d4aed6845e6905fc2a8afe93175d94a83ba2a01a0", size = 1977288, upload-time = "2025-11-04T13:41:26.33Z" }, - { url = "https://files.pythonhosted.org/packages/92/ed/77542d0c51538e32e15afe7899d79efce4b81eee631d99850edc2f5e9349/pydantic_core-2.41.5-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:8566def80554c3faa0e65ac30ab0932b9e3a5cd7f8323764303d468e5c37595a", size = 2120255, upload-time = "2025-11-04T13:41:28.569Z" }, - { url = 
"https://files.pythonhosted.org/packages/bb/3d/6913dde84d5be21e284439676168b28d8bbba5600d838b9dca99de0fad71/pydantic_core-2.41.5-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:b80aa5095cd3109962a298ce14110ae16b8c1aece8b72f9dafe81cf597ad80b3", size = 1863760, upload-time = "2025-11-04T13:41:31.055Z" }, - { url = "https://files.pythonhosted.org/packages/5a/f0/e5e6b99d4191da102f2b0eb9687aaa7f5bea5d9964071a84effc3e40f997/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3006c3dd9ba34b0c094c544c6006cc79e87d8612999f1a5d43b769b89181f23c", size = 1878092, upload-time = "2025-11-04T13:41:33.21Z" }, - { url = "https://files.pythonhosted.org/packages/71/48/36fb760642d568925953bcc8116455513d6e34c4beaa37544118c36aba6d/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:72f6c8b11857a856bcfa48c86f5368439f74453563f951e473514579d44aa612", size = 2053385, upload-time = "2025-11-04T13:41:35.508Z" }, - { url = "https://files.pythonhosted.org/packages/20/25/92dc684dd8eb75a234bc1c764b4210cf2646479d54b47bf46061657292a8/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5cb1b2f9742240e4bb26b652a5aeb840aa4b417c7748b6f8387927bc6e45e40d", size = 2218832, upload-time = "2025-11-04T13:41:37.732Z" }, - { url = "https://files.pythonhosted.org/packages/e2/09/f53e0b05023d3e30357d82eb35835d0f6340ca344720a4599cd663dca599/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:bd3d54f38609ff308209bd43acea66061494157703364ae40c951f83ba99a1a9", size = 2327585, upload-time = "2025-11-04T13:41:40Z" }, - { url = "https://files.pythonhosted.org/packages/aa/4e/2ae1aa85d6af35a39b236b1b1641de73f5a6ac4d5a7509f77b814885760c/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2ff4321e56e879ee8d2a879501c8e469414d948f4aba74a2d4593184eb326660", size = 2041078, upload-time = 
"2025-11-04T13:41:42.323Z" }, - { url = "https://files.pythonhosted.org/packages/cd/13/2e215f17f0ef326fc72afe94776edb77525142c693767fc347ed6288728d/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d0d2568a8c11bf8225044aa94409e21da0cb09dcdafe9ecd10250b2baad531a9", size = 2173914, upload-time = "2025-11-04T13:41:45.221Z" }, - { url = "https://files.pythonhosted.org/packages/02/7a/f999a6dcbcd0e5660bc348a3991c8915ce6599f4f2c6ac22f01d7a10816c/pydantic_core-2.41.5-cp314-cp314t-musllinux_1_1_aarch64.whl", hash = "sha256:a39455728aabd58ceabb03c90e12f71fd30fa69615760a075b9fec596456ccc3", size = 2129560, upload-time = "2025-11-04T13:41:47.474Z" }, - { url = "https://files.pythonhosted.org/packages/3a/b1/6c990ac65e3b4c079a4fb9f5b05f5b013afa0f4ed6780a3dd236d2cbdc64/pydantic_core-2.41.5-cp314-cp314t-musllinux_1_1_armv7l.whl", hash = "sha256:239edca560d05757817c13dc17c50766136d21f7cd0fac50295499ae24f90fdf", size = 2329244, upload-time = "2025-11-04T13:41:49.992Z" }, - { url = "https://files.pythonhosted.org/packages/d9/02/3c562f3a51afd4d88fff8dffb1771b30cfdfd79befd9883ee094f5b6c0d8/pydantic_core-2.41.5-cp314-cp314t-musllinux_1_1_x86_64.whl", hash = "sha256:2a5e06546e19f24c6a96a129142a75cee553cc018ffee48a460059b1185f4470", size = 2331955, upload-time = "2025-11-04T13:41:54.079Z" }, - { url = "https://files.pythonhosted.org/packages/5c/96/5fb7d8c3c17bc8c62fdb031c47d77a1af698f1d7a406b0f79aaa1338f9ad/pydantic_core-2.41.5-cp314-cp314t-win32.whl", hash = "sha256:b4ececa40ac28afa90871c2cc2b9ffd2ff0bf749380fbdf57d165fd23da353aa", size = 1988906, upload-time = "2025-11-04T13:41:56.606Z" }, - { url = "https://files.pythonhosted.org/packages/22/ed/182129d83032702912c2e2d8bbe33c036f342cc735737064668585dac28f/pydantic_core-2.41.5-cp314-cp314t-win_amd64.whl", hash = "sha256:80aa89cad80b32a912a65332f64a4450ed00966111b6615ca6816153d3585a8c", size = 1981607, upload-time = "2025-11-04T13:41:58.889Z" }, - { url = 
"https://files.pythonhosted.org/packages/9f/ed/068e41660b832bb0b1aa5b58011dea2a3fe0ba7861ff38c4d4904c1c1a99/pydantic_core-2.41.5-cp314-cp314t-win_arm64.whl", hash = "sha256:35b44f37a3199f771c3eaa53051bc8a70cd7b54f333531c59e29fd4db5d15008", size = 1974769, upload-time = "2025-11-04T13:42:01.186Z" }, - { url = "https://files.pythonhosted.org/packages/11/72/90fda5ee3b97e51c494938a4a44c3a35a9c96c19bba12372fb9c634d6f57/pydantic_core-2.41.5-graalpy311-graalpy242_311_native-macosx_10_12_x86_64.whl", hash = "sha256:b96d5f26b05d03cc60f11a7761a5ded1741da411e7fe0909e27a5e6a0cb7b034", size = 2115441, upload-time = "2025-11-04T13:42:39.557Z" }, - { url = "https://files.pythonhosted.org/packages/1f/53/8942f884fa33f50794f119012dc6a1a02ac43a56407adaac20463df8e98f/pydantic_core-2.41.5-graalpy311-graalpy242_311_native-macosx_11_0_arm64.whl", hash = "sha256:634e8609e89ceecea15e2d61bc9ac3718caaaa71963717bf3c8f38bfde64242c", size = 1930291, upload-time = "2025-11-04T13:42:42.169Z" }, - { url = "https://files.pythonhosted.org/packages/79/c8/ecb9ed9cd942bce09fc888ee960b52654fbdbede4ba6c2d6e0d3b1d8b49c/pydantic_core-2.41.5-graalpy311-graalpy242_311_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:93e8740d7503eb008aa2df04d3b9735f845d43ae845e6dcd2be0b55a2da43cd2", size = 1948632, upload-time = "2025-11-04T13:42:44.564Z" }, - { url = "https://files.pythonhosted.org/packages/2e/1b/687711069de7efa6af934e74f601e2a4307365e8fdc404703afc453eab26/pydantic_core-2.41.5-graalpy311-graalpy242_311_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f15489ba13d61f670dcc96772e733aad1a6f9c429cc27574c6cdaed82d0146ad", size = 2138905, upload-time = "2025-11-04T13:42:47.156Z" }, - { url = "https://files.pythonhosted.org/packages/09/32/59b0c7e63e277fa7911c2fc70ccfb45ce4b98991e7ef37110663437005af/pydantic_core-2.41.5-graalpy312-graalpy250_312_native-macosx_10_12_x86_64.whl", hash = "sha256:7da7087d756b19037bc2c06edc6c170eeef3c3bafcb8f532ff17d64dc427adfd", size = 
2110495, upload-time = "2025-11-04T13:42:49.689Z" }, - { url = "https://files.pythonhosted.org/packages/aa/81/05e400037eaf55ad400bcd318c05bb345b57e708887f07ddb2d20e3f0e98/pydantic_core-2.41.5-graalpy312-graalpy250_312_native-macosx_11_0_arm64.whl", hash = "sha256:aabf5777b5c8ca26f7824cb4a120a740c9588ed58df9b2d196ce92fba42ff8dc", size = 1915388, upload-time = "2025-11-04T13:42:52.215Z" }, - { url = "https://files.pythonhosted.org/packages/6e/0d/e3549b2399f71d56476b77dbf3cf8937cec5cd70536bdc0e374a421d0599/pydantic_core-2.41.5-graalpy312-graalpy250_312_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c007fe8a43d43b3969e8469004e9845944f1a80e6acd47c150856bb87f230c56", size = 1942879, upload-time = "2025-11-04T13:42:56.483Z" }, - { url = "https://files.pythonhosted.org/packages/f7/07/34573da085946b6a313d7c42f82f16e8920bfd730665de2d11c0c37a74b5/pydantic_core-2.41.5-graalpy312-graalpy250_312_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:76d0819de158cd855d1cbb8fcafdf6f5cf1eb8e470abe056d5d161106e38062b", size = 2139017, upload-time = "2025-11-04T13:42:59.471Z" }, - { url = "https://files.pythonhosted.org/packages/e6/b0/1a2aa41e3b5a4ba11420aba2d091b2d17959c8d1519ece3627c371951e73/pydantic_core-2.41.5-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:b5819cd790dbf0c5eb9f82c73c16b39a65dd6dd4d1439dcdea7816ec9adddab8", size = 2103351, upload-time = "2025-11-04T13:43:02.058Z" }, - { url = "https://files.pythonhosted.org/packages/a4/ee/31b1f0020baaf6d091c87900ae05c6aeae101fa4e188e1613c80e4f1ea31/pydantic_core-2.41.5-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:5a4e67afbc95fa5c34cf27d9089bca7fcab4e51e57278d710320a70b956d1b9a", size = 1925363, upload-time = "2025-11-04T13:43:05.159Z" }, - { url = "https://files.pythonhosted.org/packages/e1/89/ab8e86208467e467a80deaca4e434adac37b10a9d134cd2f99b28a01e483/pydantic_core-2.41.5-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:ece5c59f0ce7d001e017643d8d24da587ea1f74f6993467d85ae8a5ef9d4f42b", size = 2135615, upload-time = "2025-11-04T13:43:08.116Z" }, - { url = "https://files.pythonhosted.org/packages/99/0a/99a53d06dd0348b2008f2f30884b34719c323f16c3be4e6cc1203b74a91d/pydantic_core-2.41.5-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:16f80f7abe3351f8ea6858914ddc8c77e02578544a0ebc15b4c2e1a0e813b0b2", size = 2175369, upload-time = "2025-11-04T13:43:12.49Z" }, - { url = "https://files.pythonhosted.org/packages/6d/94/30ca3b73c6d485b9bb0bc66e611cff4a7138ff9736b7e66bcf0852151636/pydantic_core-2.41.5-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:33cb885e759a705b426baada1fe68cbb0a2e68e34c5d0d0289a364cf01709093", size = 2144218, upload-time = "2025-11-04T13:43:15.431Z" }, - { url = "https://files.pythonhosted.org/packages/87/57/31b4f8e12680b739a91f472b5671294236b82586889ef764b5fbc6669238/pydantic_core-2.41.5-pp310-pypy310_pp73-musllinux_1_1_armv7l.whl", hash = "sha256:c8d8b4eb992936023be7dee581270af5c6e0697a8559895f527f5b7105ecd36a", size = 2329951, upload-time = "2025-11-04T13:43:18.062Z" }, - { url = "https://files.pythonhosted.org/packages/7d/73/3c2c8edef77b8f7310e6fb012dbc4b8551386ed575b9eb6fb2506e28a7eb/pydantic_core-2.41.5-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:242a206cd0318f95cd21bdacff3fcc3aab23e79bba5cac3db5a841c9ef9c6963", size = 2318428, upload-time = "2025-11-04T13:43:20.679Z" }, - { url = "https://files.pythonhosted.org/packages/2f/02/8559b1f26ee0d502c74f9cca5c0d2fd97e967e083e006bbbb4e97f3a043a/pydantic_core-2.41.5-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:d3a978c4f57a597908b7e697229d996d77a6d3c94901e9edee593adada95ce1a", size = 2147009, upload-time = "2025-11-04T13:43:23.286Z" }, - { url = "https://files.pythonhosted.org/packages/5f/9b/1b3f0e9f9305839d7e84912f9e8bfbd191ed1b1ef48083609f0dabde978c/pydantic_core-2.41.5-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = 
"sha256:b2379fa7ed44ddecb5bfe4e48577d752db9fc10be00a6b7446e9663ba143de26", size = 2101980, upload-time = "2025-11-04T13:43:25.97Z" }, - { url = "https://files.pythonhosted.org/packages/a4/ed/d71fefcb4263df0da6a85b5d8a7508360f2f2e9b3bf5814be9c8bccdccc1/pydantic_core-2.41.5-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:266fb4cbf5e3cbd0b53669a6d1b039c45e3ce651fd5442eff4d07c2cc8d66808", size = 1923865, upload-time = "2025-11-04T13:43:28.763Z" }, - { url = "https://files.pythonhosted.org/packages/ce/3a/626b38db460d675f873e4444b4bb030453bbe7b4ba55df821d026a0493c4/pydantic_core-2.41.5-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:58133647260ea01e4d0500089a8c4f07bd7aa6ce109682b1426394988d8aaacc", size = 2134256, upload-time = "2025-11-04T13:43:31.71Z" }, - { url = "https://files.pythonhosted.org/packages/83/d9/8412d7f06f616bbc053d30cb4e5f76786af3221462ad5eee1f202021eb4e/pydantic_core-2.41.5-pp311-pypy311_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:287dad91cfb551c363dc62899a80e9e14da1f0e2b6ebde82c806612ca2a13ef1", size = 2174762, upload-time = "2025-11-04T13:43:34.744Z" }, - { url = "https://files.pythonhosted.org/packages/55/4c/162d906b8e3ba3a99354e20faa1b49a85206c47de97a639510a0e673f5da/pydantic_core-2.41.5-pp311-pypy311_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:03b77d184b9eb40240ae9fd676ca364ce1085f203e1b1256f8ab9984dca80a84", size = 2143141, upload-time = "2025-11-04T13:43:37.701Z" }, - { url = "https://files.pythonhosted.org/packages/1f/f2/f11dd73284122713f5f89fc940f370d035fa8e1e078d446b3313955157fe/pydantic_core-2.41.5-pp311-pypy311_pp73-musllinux_1_1_armv7l.whl", hash = "sha256:a668ce24de96165bb239160b3d854943128f4334822900534f2fe947930e5770", size = 2330317, upload-time = "2025-11-04T13:43:40.406Z" }, - { url = "https://files.pythonhosted.org/packages/88/9d/b06ca6acfe4abb296110fb1273a4d848a0bfb2ff65f3ee92127b3244e16b/pydantic_core-2.41.5-pp311-pypy311_pp73-musllinux_1_1_x86_64.whl", hash = 
"sha256:f14f8f046c14563f8eb3f45f499cc658ab8d10072961e07225e507adb700e93f", size = 2316992, upload-time = "2025-11-04T13:43:43.602Z" }, - { url = "https://files.pythonhosted.org/packages/36/c7/cfc8e811f061c841d7990b0201912c3556bfeb99cdcb7ed24adc8d6f8704/pydantic_core-2.41.5-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:56121965f7a4dc965bff783d70b907ddf3d57f6eba29b6d2e5dabfaf07799c51", size = 2145302, upload-time = "2025-11-04T13:43:46.64Z" }, -] - -[[package]] -name = "pydantic-evals" -version = "1.73.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "anyio" }, - { name = "logfire-api" }, - { name = "pydantic" }, - { name = "pydantic-ai-slim" }, - { name = "pyyaml" }, - { name = "rich" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/02/45/ce1f9b97c4838f940c98693bc1d6298f0e1396355998942b095ce17157fe/pydantic_evals-1.73.0.tar.gz", hash = "sha256:c1f38ad9c4f566bee6958c92f205b8200957b4baf3dd5239e2a4a06edd28e3dc", size = 56137, upload-time = "2026-03-27T03:49:50.861Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/01/4e/aefc34a68adc165ddec22c0632cb3076579c46751ac11acdf8cec6462891/pydantic_evals-1.73.0-py3-none-any.whl", hash = "sha256:0609210d4825cc8339b5cb649be38321450b46d6e87d72c1ffde73598741fd5a", size = 67143, upload-time = "2026-03-27T03:49:44.298Z" }, -] - -[[package]] -name = "pydantic-graph" -version = "1.73.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "httpx" }, - { name = "logfire-api" }, - { name = "pydantic" }, - { name = "typing-inspection" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/1a/22/d479ea32e3c712c6711e41157fb975d81582e5171510e4c662f21a85e9fe/pydantic_graph-1.73.0.tar.gz", hash = "sha256:f0d3e4984af1d902cdda1ccd3fcd86949d45d3ed21559e781f7cf9eace2ed914", size = 58717, upload-time = "2026-03-27T03:49:51.967Z" } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/08/b3/4cc0b1c543b8a0c1f9add7bdeb2e8cd583961a795664a1a74d1fc8200416/pydantic_graph-1.73.0-py3-none-any.whl", hash = "sha256:aaab8b1580885f5108401db0a7da58d6c7643e467eb626b8a1364b1030327de0", size = 72504, upload-time = "2026-03-27T03:49:45.668Z" }, -] - [[package]] name = "pygments" version = "2.19.2" @@ -1574,18 +1270,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/18/67/36e9267722cc04a6b9f15c7f3441c2363321a3ea07da7ae0c0707beb2a9c/typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548", size = 44614, upload-time = "2025-08-25T13:49:24.86Z" }, ] -[[package]] -name = "typing-inspection" -version = "0.4.2" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "typing-extensions" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/55/e3/70399cb7dd41c10ac53367ae42139cf4b1ca5f36bb3dc6c9d33acdb43655/typing_inspection-0.4.2.tar.gz", hash = "sha256:ba561c48a67c5958007083d386c3295464928b01faa735ab8547c5692e87f464", size = 75949, upload-time = "2025-10-01T02:14:41.687Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/dc/9b/47798a6c91d8bdb567fe2698fe81e0c6b7cb7ef4d13da4114b41d239f65d/typing_inspection-0.4.2-py3-none-any.whl", hash = "sha256:4ed1cacbdc298c220f1bd249ed5287caa16f34d44ef4e9c3d0cbad5b521545e7", size = 14611, upload-time = "2025-10-01T02:14:40.154Z" }, -] - [[package]] name = "urllib3" version = "2.5.0" @@ -1700,12 +1384,3 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/68/a1/dcb68430b1d00b698ae7a7e0194433bce4f07ded185f0ee5fb21e2a2e91e/websockets-15.0.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:cad21560da69f4ce7658ca2cb83138fb4cf695a2ba3e475e0559e05991aa8122", size = 176884, upload-time = "2025-03-05T20:03:27.934Z" }, { url = 
"https://files.pythonhosted.org/packages/fa/a8/5b41e0da817d64113292ab1f8247140aac61cbf6cfd085d6a0fa77f4984f/websockets-15.0.1-py3-none-any.whl", hash = "sha256:f7a866fbc1e97b5c617ee4116daaa09b722101d4a3c170c787450ba409f9736f", size = 169743, upload-time = "2025-03-05T20:03:39.41Z" }, ] - -[[package]] -name = "zipp" -version = "3.23.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/e3/02/0f2892c661036d50ede074e376733dca2ae7c6eb617489437771209d4180/zipp-3.23.0.tar.gz", hash = "sha256:a07157588a12518c9d4034df3fbbee09c814741a33ff63c05fa29d26a2404166", size = 25547, upload-time = "2025-06-08T17:06:39.4Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/2e/54/647ade08bf0db230bfea292f893923872fd20be6ac6f53b2b936ba839d75/zipp-3.23.0-py3-none-any.whl", hash = "sha256:071652d6115ed432f5ce1d34c336c0adfd6a884660d1e9712a256d3d3bd4b14e", size = 10276, upload-time = "2025-06-08T17:06:38.034Z" }, -] From 72a8457546084569deb4d221b9df26503cc59dd8 Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Sat, 25 Apr 2026 14:33:49 +0200 Subject: [PATCH 43/60] tests(history): ensure `--runs` displays newest entries first MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Added `TestRunsOrderRecentFirst` to validate that `--runs` follows the git log convention, showing the most recent entries first. - Updated CLI logic to reverse storage order (oldest → newest) for display consistency. - Adjusted index formatting and numbering in both plain and rich output modes to reflect the newest-first display. 
--- protest/cli/history.py | 11 +++++---- tests/test_history_cli.py | 48 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+), 4 deletions(-) diff --git a/protest/cli/history.py b/protest/cli/history.py index 01198d8..88c94b8 100644 --- a/protest/cli/history.py +++ b/protest/cli/history.py @@ -115,11 +115,13 @@ def stats(self, entries: list[dict[str, Any]]) -> None: print() def runs(self, entries: list[dict[str, Any]]) -> None: - for i, e in enumerate(entries): + # Display most-recent first (git log convention). `entries` arrives + # sorted oldest→newest from storage, so we reverse for display. + for i, e in enumerate(reversed(entries)): p, t, r = _entry_stats(e) git = (e.get("git") or {}).get("commit_short", "?") ts = e.get("timestamp", "?")[:16] - print(f"\n #{len(entries) - i:<3} {ts} {p}/{t} ({r * 100:.0f}%) {git}") + print(f"\n #{i + 1:<3} {ts} {p}/{t} ({r * 100:.0f}%) {git}") for sn, sd in e.get("suites", {}).items(): if not isinstance(sd, dict): continue @@ -212,13 +214,14 @@ def stats(self, entries: list[dict[str, Any]]) -> None: def runs(self, entries: list[dict[str, Any]]) -> None: self.console.print() - for i, e in enumerate(entries): + # Display most-recent first (git log convention). 
+ for i, e in enumerate(reversed(entries)): p, t, r = _entry_stats(e) git = (e.get("git") or {}).get("commit_short", "?") ts = e.get("timestamp", "?")[:16] rate_color = "green" if r >= 0.8 else "yellow" if r >= 0.5 else "red" self.console.print( - f" [dim]#{len(entries) - i:<3}[/] {ts} " + f" [dim]#{i + 1:<3}[/] {ts} " f"[{rate_color}]{p}/{t} ({r * 100:.0f}%)[/] [dim]{git}[/]" ) for sn, sd in e.get("suites", {}).items(): diff --git a/tests/test_history_cli.py b/tests/test_history_cli.py index 70e81b2..b19e5cb 100644 --- a/tests/test_history_cli.py +++ b/tests/test_history_cli.py @@ -16,6 +16,7 @@ import pytest from protest.cli.history import handle_history_command +from protest.history.storage import HISTORY_FILE, append_entry if TYPE_CHECKING: from pathlib import Path @@ -110,3 +111,50 @@ def test_help_output_shows_action_and_kind_groups( stdout = capsys.readouterr().out assert "[--runs | --show [N] | --compare]" in stdout assert "[--evals | --tests]" in stdout + + +class TestRunsOrderRecentFirst: + """`--runs` lists most-recent run first (git log convention). + + Storage returns entries oldest→newest; the CLI must reverse for display + so #1 maps to the newest run, matching `git stash list` / `git log`. + """ + + def _seed(self, tmp_path: Path, commits: list[tuple[str, str]]) -> None: + path = tmp_path / HISTORY_FILE + for ts, commit in commits: + append_entry( + path, + { + "schema_version": 1, + "run_id": commit, + "timestamp": ts, + "git": {"commit_short": commit}, + "suites": {}, + }, + ) + + def test_runs_displays_newest_first( + self, + tmp_path: Path, + capsys: pytest.CaptureFixture[str], + ) -> None: + # Seed in chronological order — storage preserves write order. + self._seed( + tmp_path, + [ + ("2026-04-25T10:00:00", "old1234"), + ("2026-04-25T11:00:00", "mid5678"), + ("2026-04-25T12:00:00", "newabcd"), + ], + ) + handle_history_command(["--runs", "--path", str(tmp_path)]) + stdout = capsys.readouterr().out + # #1 is newest, #3 is oldest. 
+ assert stdout.index("#1") < stdout.index("#2") < stdout.index("#3") + assert ( + stdout.index("newabcd") < stdout.index("mid5678") < stdout.index("old1234") + ) + # And #1 lines up with the newest commit, not the oldest. + newest_line = next(line for line in stdout.splitlines() if "#1" in line) + assert "newabcd" in newest_line From 8b64322d7973b8c2fa8c1b85f6938381e970344c Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Sat, 25 Apr 2026 14:38:24 +0200 Subject: [PATCH 44/60] refactor(evals): replace `keyword_check` with `contains_keywords` and update evaluator logic --- docs/evals.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/evals.md b/docs/evals.md index e778b22..324e0b2 100644 --- a/docs/evals.md +++ b/docs/evals.md @@ -226,10 +226,10 @@ Skip expensive evaluators (LLM judges) when cheap ones already fail: from protest.evals import ShortCircuit evaluators=[ - not_empty, # always runs + not_empty, # always runs ShortCircuit([ - contains_expected_facts(min_score=0.3), # 0ms — if fail → stop - llm_judge(rubric="factual accuracy"), # 3s — skipped if above fails + contains_keywords(keywords=["paris"], min_recall=0.5), # 0ms — if fail → stop + llm_judge(rubric="factual accuracy"), # 3s — skipped if above fails ]), ] ``` @@ -243,7 +243,7 @@ evaluators=[ evaluators=[not_empty] # With params → call to bind -evaluators=[keyword_check(keywords=["python", "async"], min_recall=0.75)] +evaluators=[contains_keywords(keywords=["python", "async"], min_recall=0.75)] # Per-case evaluators (added to suite-level) EvalCase(name="factual_accuracy_case", inputs="...", evaluators=[llm_judge(rubric="Check factual accuracy")]) From bf27f4ce702e6bb731442dc6d5808494954d640f Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Sat, 25 Apr 2026 14:44:25 +0200 Subject: [PATCH 45/60] tests(evals): add stricter `contains_keywords` threshold tests and improve 
evaluator logic documentation - Introduced tests for `min_recall` edge cases, including exact threshold passing, discontinuity fixes, and below-threshold failures. - Updated `contains_keywords` evaluator to simplify `all_keywords_present` logic and ensure consistent behavior across thresholds. - Adjusted default `min_recall` to `1.0` in docs and implementation for stricter compliance. --- docs/evals.md | 2 +- protest/evals/evaluators.py | 12 +++++++++--- tests/evals/test_e2e.py | 33 +++++++++++++++++++++++++++++++++ 3 files changed, 43 insertions(+), 4 deletions(-) diff --git a/docs/evals.md b/docs/evals.md index 324e0b2..84b25ca 100644 --- a/docs/evals.md +++ b/docs/evals.md @@ -266,7 +266,7 @@ EvalCase(name="factual_accuracy_case", inputs="...", evaluators=[llm_judge(rubri | Evaluator | Params | Returns | |-----------|--------|---------| -| `contains_keywords` | `keywords, min_recall=0.0` | `keyword_recall: float`, `all_keywords_present: bool` | +| `contains_keywords` | `keywords, min_recall=1.0` | `keyword_recall: float`, `all_keywords_present: bool` | | `contains_expected` | `case_sensitive=False` | `bool` | | `does_not_contain` | `forbidden` | `no_forbidden_words: bool` | | `not_empty` | — | `bool` | diff --git a/protest/evals/evaluators.py b/protest/evals/evaluators.py index ec7d9bd..dcac9b4 100644 --- a/protest/evals/evaluators.py +++ b/protest/evals/evaluators.py @@ -45,16 +45,22 @@ class WordOverlapResult: @evaluator def contains_keywords( - ctx: EvalContext[Any, str], keywords: list[str], min_recall: float = 0.0 + ctx: EvalContext[Any, str], keywords: list[str], min_recall: float = 1.0 ) -> ContainsKeywordsResult: - """Check that the output contains expected keywords (case-insensitive).""" + """Check that the output contains expected keywords (case-insensitive). + + `min_recall` is the minimum fraction of keywords that must appear for + the verdict to pass. 
Default `1.0` requires all keywords to be present; + set to `0.5` for "at least half", `0.0` to ignore the verdict and only + track the metric. + """ output_lower = ctx.output.lower() found = sum(1 for kw in keywords if kw.lower() in output_lower) total = len(keywords) recall = found / total if total else 1.0 return ContainsKeywordsResult( keyword_recall=recall, - all_keywords_present=recall >= min_recall if min_recall > 0 else found == total, + all_keywords_present=recall >= min_recall, ) diff --git a/tests/evals/test_e2e.py b/tests/evals/test_e2e.py index f1fc5d1..921cc39 100644 --- a/tests/evals/test_e2e.py +++ b/tests/evals/test_e2e.py @@ -721,6 +721,39 @@ def test_contains_keywords(self) -> None: assert result.keyword_recall == 1.0 assert result.all_keywords_present is True + def test_contains_keywords_default_requires_all(self) -> None: + """Default `min_recall=1.0` means strict: missing one → verdict False.""" + e = contains_keywords(keywords=["hello", "world"]) + result = e(self._make_ctx("Only hello here")) + assert result.keyword_recall == 0.5 + assert result.all_keywords_present is False + + def test_contains_keywords_threshold_continuity_at_zero(self) -> None: + """Regression: `min_recall=0.0` must always pass (no discontinuity at 0). + + Earlier behavior: `min_recall=0.0` flipped to strict mode (all required), + while `min_recall=0.0001` was permissive — surprising at the boundary. + Now `recall >= min_recall` applies uniformly. 
+ """ + e = contains_keywords(keywords=["alpha", "beta"], min_recall=0.0) + result = e(self._make_ctx("nothing matches")) + assert result.keyword_recall == 0.0 + assert result.all_keywords_present is True + + def test_contains_keywords_threshold_at_exact_value(self) -> None: + """Verdict passes when recall equals the threshold exactly.""" + e = contains_keywords(keywords=["alpha", "beta"], min_recall=0.5) + result = e(self._make_ctx("only alpha here")) + assert result.keyword_recall == 0.5 + assert result.all_keywords_present is True + + def test_contains_keywords_threshold_just_below(self) -> None: + """Verdict fails when recall is below the threshold.""" + e = contains_keywords(keywords=["alpha", "beta", "gamma"], min_recall=0.5) + result = e(self._make_ctx("only alpha")) + assert abs(result.keyword_recall - 1 / 3) < 1e-9 + assert result.all_keywords_present is False + def test_contains_expected(self) -> None: e = contains_expected assert e(self._make_ctx("Hello World", "world")) is True From bfa9d14106728b5bd78cc2420e8ce2cd3f113c2d Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Sat, 25 Apr 2026 14:51:41 +0200 Subject: [PATCH 46/60] tests(evals): add `not_empty` tests for Sized containers and clarify evaluator behavior - Added tests to ensure `not_empty` correctly handles empty and non-empty lists, dicts, and sets. - Updated `not_empty` docstring and logic to explicitly check `Sized` objects using `len()`. 
--- protest/evals/evaluators.py | 11 ++++++++++- tests/evals/test_e2e.py | 38 +++++++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+), 1 deletion(-) diff --git a/protest/evals/evaluators.py b/protest/evals/evaluators.py index dcac9b4..8866961 100644 --- a/protest/evals/evaluators.py +++ b/protest/evals/evaluators.py @@ -9,6 +9,7 @@ import json as json_module import re +from collections.abc import Sized from dataclasses import dataclass from typing import Annotated, Any @@ -86,11 +87,19 @@ def does_not_contain( @evaluator def not_empty(ctx: EvalContext[Any, Any]) -> bool: - """Check that the output is not empty or whitespace-only.""" + """Check that the output is not empty. + + - `None` -> False. + - `str`: False if empty or whitespace-only. + - Sized (list, dict, set, tuple, ...): False if `len() == 0`. + - Other (int, float, dataclass, custom objects): True. + """ if ctx.output is None: return False if isinstance(ctx.output, str): return len(ctx.output.strip()) > 0 + if isinstance(ctx.output, Sized): + return len(ctx.output) > 0 return True diff --git a/tests/evals/test_e2e.py b/tests/evals/test_e2e.py index 921cc39..5e86c18 100644 --- a/tests/evals/test_e2e.py +++ b/tests/evals/test_e2e.py @@ -769,6 +769,44 @@ def test_not_empty(self) -> None: assert not_empty(self._make_ctx("")) is False assert not_empty(self._make_ctx(" ")) is False + def test_not_empty_handles_sized_containers(self) -> None: + """Sized containers: empty -> False, non-empty -> True. + + Earlier behavior fell through to `return True` for any non-string, + so `not_empty([])` reported True — misleading for tasks that return + lists/dicts (e.g. tool calls, retrieved chunks). + """ + # Helper accepts Any at runtime; type hint is just a default. 
+ ctx_empty_list: Any = self._make_ctx("") + ctx_empty_list.output = [] + assert not_empty(ctx_empty_list) is False + + ctx_nonempty_list: Any = self._make_ctx("") + ctx_nonempty_list.output = [1, 2] + assert not_empty(ctx_nonempty_list) is True + + ctx_empty_dict: Any = self._make_ctx("") + ctx_empty_dict.output = {} + assert not_empty(ctx_empty_dict) is False + + ctx_nonempty_dict: Any = self._make_ctx("") + ctx_nonempty_dict.output = {"a": 1} + assert not_empty(ctx_nonempty_dict) is True + + ctx_empty_set: Any = self._make_ctx("") + ctx_empty_set.output = set() + assert not_empty(ctx_empty_set) is False + + def test_not_empty_unsized_objects_still_pass(self) -> None: + """Non-Sized values (int, float, dataclass): always True (kept as-is).""" + ctx_int: Any = self._make_ctx("") + ctx_int.output = 42 + assert not_empty(ctx_int) is True + + ctx_zero: Any = self._make_ctx("") + ctx_zero.output = 0 # 0 is not None, not Sized — still passes. + assert not_empty(ctx_zero) is True + def test_max_length(self) -> None: e = max_length(max_chars=5) result = e(self._make_ctx("hi")) From e54f17912ee10fc1f00b9be0bedf3134178913a9 Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Sat, 25 Apr 2026 15:05:51 +0200 Subject: [PATCH 47/60] tests(evals): add precision tests for sub-millisecond durations and adaptive formatting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Added tests to ensure `_serialize_eval_case` preserves 10 µs precision, preventing sub-ms durations from collapsing to `0.0`. - Introduced `_format_case_duration` tests for adaptive time unit rendering across microseconds, milliseconds, and seconds. - Updated markdown renderer to use `_format_case_duration` for task durations. - Increased duration serialization precision from 3 to 5 decimals in history plugin. 
--- protest/evals/results_writer.py | 26 +++++++-- protest/history/plugin.py | 9 +++- tests/evals/test_duration_precision.py | 75 ++++++++++++++++++++++++++ 3 files changed, 103 insertions(+), 7 deletions(-) create mode 100644 tests/evals/test_duration_precision.py diff --git a/protest/evals/results_writer.py b/protest/evals/results_writer.py index b611d6b..983ac8b 100644 --- a/protest/evals/results_writer.py +++ b/protest/evals/results_writer.py @@ -90,11 +90,7 @@ def _write_case_file(case: EvalCaseResult, run_dir: Path) -> None: def _render_case(case: EvalCaseResult) -> str: status = "PASS ✓" if case.passed else "FAIL ✗" - duration = ( - f"{case.duration * 1000:.0f}ms" - if case.duration < 1 - else f"{case.duration:.2f}s" - ) + duration = _format_case_duration(case.duration) lines: list[str] = [ f"# {case.case_name} — {status} ({duration})", "", @@ -113,6 +109,26 @@ def _render_case(case: EvalCaseResult) -> str: return "\n".join(lines) +_ONE_MILLISECOND = 0.001 +_TEN_MILLISECONDS = 0.01 +_ONE_SECOND = 1.0 + + +def _format_case_duration(seconds: float) -> str: + """Format SUT duration with adaptive units. + + Sub-ms tasks (deterministic stubs, fast classifiers) used to render as + `0ms` because the renderer rounded to the nearest millisecond. 
+ """ + if seconds < _ONE_MILLISECOND: + return f"{seconds * 1_000_000:.0f}µs" + if seconds < _TEN_MILLISECONDS: + return f"{seconds * 1000:.2f}ms" + if seconds < _ONE_SECOND: + return f"{seconds * 1000:.0f}ms" + return f"{seconds:.2f}s" + + def _format_score(score: EvalScore) -> str: icon = "·" if score.is_metric else ("✓" if score.passed else "✗") return f"- **{score.name}**: {score.value} {icon}" diff --git a/protest/history/plugin.py b/protest/history/plugin.py index 00c5e8b..e662e14 100644 --- a/protest/history/plugin.py +++ b/protest/history/plugin.py @@ -101,7 +101,7 @@ def _record_test(self, result: TestResult, *, passed: bool) -> None: self._test_suites[suite_name] = {} self._test_suites[suite_name][result.name] = { "passed": passed, - "duration": round(result.duration, 3), + "duration": round(result.duration, 5), } # -- Eval event handlers -------------------------------------------------- @@ -216,11 +216,16 @@ def _serialize_eval_case(case: EvalCaseResult) -> dict[str, Any]: Skipped scores are excluded: a ShortCircuit skip produces `EvalScore(value=False, skipped=True)` — serializing it as an assertion would look like a real failure in `history --compare` diffs. + + `case.duration` here is `EvalPayload.task_duration` (SUT-only timing, + set by the eval wrapper), not the full TestResult duration shown by live + reporters. Persisted at 10 µs precision so sub-ms SUTs don't all hash + down to 0.0 across runs. """ entry: dict[str, Any] = { "passed": case.passed, "is_error": case.is_error, - "duration": round(case.duration, 3), + "duration": round(case.duration, 5), "scores": { s.name: s.value for s in case.scores if s.is_metric and not s.skipped }, diff --git a/tests/evals/test_duration_precision.py b/tests/evals/test_duration_precision.py new file mode 100644 index 0000000..fdd47bf --- /dev/null +++ b/tests/evals/test_duration_precision.py @@ -0,0 +1,75 @@ +"""Tests for C3 — sub-millisecond duration handling. 
+ +The eval pipeline persists `EvalPayload.task_duration` (SUT-only timing). +For deterministic stubs / fast classifiers, that value is sub-millisecond +and the previous serializer (`round(_, 3)`) collapsed everything to `0.0`, +making run-over-run comparisons useless. The markdown renderer had the +matching bug — it printed `0ms` for any sub-ms task. +""" + +from __future__ import annotations + +from protest.evals.results_writer import _format_case_duration, _render_case +from protest.evals.types import EvalCaseResult +from protest.history.plugin import _serialize_eval_case + + +def _make_case(duration: float) -> EvalCaseResult: + return EvalCaseResult( + case_name="case", + node_id="suite::case", + scores=(), + duration=duration, + passed=True, + inputs="in", + output="out", + expected_output=None, + case_hash="h", + eval_hash="e", + is_error=False, + ) + + +class TestSerializerPrecision: + """`_serialize_eval_case` keeps 5-decimal precision (10 µs).""" + + def test_sub_millisecond_is_not_collapsed_to_zero(self) -> None: + case = _make_case(2.07e-05) # 20.7 µs + entry = _serialize_eval_case(case) + # Previously: 0.0 (round to 3 decimals) + # Now: 2e-05 (round to 5 decimals — 10 µs precision) + assert entry["duration"] > 0 + assert entry["duration"] == 2e-05 + + def test_distinct_sub_ms_values_remain_distinguishable(self) -> None: + e1 = _serialize_eval_case(_make_case(1.0e-05)) # 10 µs + e2 = _serialize_eval_case(_make_case(5.0e-05)) # 50 µs + assert e1["duration"] != e2["duration"] + + def test_millisecond_values_unchanged(self) -> None: + # >1ms: 5-decimal rounding produces the same numbers as 3-decimal. 
+ entry = _serialize_eval_case(_make_case(0.123)) + assert entry["duration"] == 0.123 + + +class TestMarkdownDurationFormat: + """`_format_case_duration` adapts unit to magnitude.""" + + def test_microseconds_for_sub_millisecond(self) -> None: + assert _format_case_duration(2.07e-05) == "21µs" + + def test_two_decimals_in_low_milliseconds(self) -> None: + # 2.5 ms — keep one fractional digit so 1ms vs 2ms is visible. + assert _format_case_duration(0.0025) == "2.50ms" + + def test_integer_milliseconds_in_mid_range(self) -> None: + assert _format_case_duration(0.135) == "135ms" + + def test_seconds_for_one_or_more(self) -> None: + assert _format_case_duration(2.5) == "2.50s" + + def test_renders_microseconds_in_case_header(self) -> None: + case = _make_case(2.07e-05) + rendered = _render_case(case) + # Header contains the duration; previously read "(0ms)". + assert "21µs" in rendered.splitlines()[0] From 1779d4a389e8071859aa828601fadb9edfc4984b Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Sat, 25 Apr 2026 15:13:09 +0200 Subject: [PATCH 48/60] tests(console): add payload, prefix handling, and reporter tests - Added tests for 3-tuple payload behavior in `console.print` with flags `raw` and `prefix`. - Verified ASCII and Rich reporters correctly render messages with/without test prefixes and markup. - Updated `console.print` to support a new `prefix` parameter for suite-level outputs (e.g., "Results: ..."). - Adjusted `on_user_print` implementations across reporters to handle the `prefix` flag correctly. 
--- protest/console.py | 10 ++- protest/evals/results_writer.py | 2 +- protest/reporting/ascii.py | 5 +- protest/reporting/rich_reporter.py | 6 +- tests/test_console_print.py | 106 +++++++++++++++++++++++++++++ 5 files changed, 122 insertions(+), 7 deletions(-) create mode 100644 tests/test_console_print.py diff --git a/protest/console.py b/protest/console.py index 2d31607..56d2df6 100644 --- a/protest/console.py +++ b/protest/console.py @@ -13,6 +13,9 @@ async def pipeline(): # Raw mode — no markup processing console.print("debug: raw bytes here", raw=True) + # Section mode — no per-test prefix (use for suite/session-level lines) + console.print(f" Results: {run_dir}", prefix=False) + Messages go through the event bus → reporters display them inline. If no event bus is available (outside a protest session), falls back to stderr. """ @@ -26,7 +29,7 @@ async def pipeline(): from protest.execution.capture import get_event_bus, real_stderr -def print(msg: str, *, raw: bool = False) -> None: +def print(msg: str, *, raw: bool = False, prefix: bool = True) -> None: """Print a message that bypasses test capture. Goes through the event bus so reporters display it at the right place. @@ -35,6 +38,9 @@ def print(msg: str, *, raw: bool = False) -> None: Args: msg: The message to print. Supports Rich markup unless raw=True. raw: If True, no markup processing — message passed as-is. + prefix: If False, omit the per-test indent/bar prefix. Use for + suite-level or session-level lines (e.g. "Results: ") that + visually belong outside any single case's output block. """ bus = get_event_bus() if bus is None: @@ -49,7 +55,7 @@ def print(msg: str, *, raw: bool = False) -> None: # only caller, and console.print is never invoked from a signal handler. 
for handler_entry in bus._handlers.get(Event.USER_PRINT, []): with contextlib.suppress(Exception): - handler_entry.func((msg, raw)) + handler_entry.func((msg, raw, prefix)) def _fallback_print(msg: str, raw: bool) -> None: diff --git a/protest/evals/results_writer.py b/protest/evals/results_writer.py index 983ac8b..71c3725 100644 --- a/protest/evals/results_writer.py +++ b/protest/evals/results_writer.py @@ -63,7 +63,7 @@ def on_eval_suite_end(self, report: Any) -> None: return run_dir = self._run_dirs.get(report.suite_name) if run_dir: - console.print(f" Results: {run_dir}") + console.print(f" Results: {run_dir}", prefix=False) # --------------------------------------------------------------------------- diff --git a/protest/reporting/ascii.py b/protest/reporting/ascii.py index 018bedf..2233a1c 100644 --- a/protest/reporting/ascii.py +++ b/protest/reporting/ascii.py @@ -318,10 +318,11 @@ def _print_failure_detail(self, result: TestResult, *, is_error: bool) -> None: print(f" {line}") def on_user_print(self, data: Any) -> None: - msg, raw = data + msg, raw, prefix = data text = msg if raw else strip_markup(msg) stream = real_stdout() - stream.write(f" | {text}\n") + line = f" | {text}\n" if prefix and not raw else f"{text}\n" + stream.write(line) stream.flush() def on_eval_suite_end(self, report: Any) -> None: diff --git a/protest/reporting/rich_reporter.py b/protest/reporting/rich_reporter.py index bf93406..159f7bb 100644 --- a/protest/reporting/rich_reporter.py +++ b/protest/reporting/rich_reporter.py @@ -375,14 +375,16 @@ def _print_failure_detail(self, result: TestResult, *, is_error: bool) -> None: self._print(f"[dim]{escaped_line}[/]") def on_user_print(self, data: Any) -> None: - msg, raw = data + msg, raw, prefix = data # Write to the real stdout, bypassing capture stream = real_stdout() c = Console(file=stream, highlight=False) if raw: c.print(msg, markup=False) - else: + elif prefix: c.print(f"[dim] │[/] {msg}") + else: + c.print(msg) def 
on_eval_suite_end(self, report: Any) -> None: if not isinstance(report, EvalSuiteReport): diff --git a/tests/test_console_print.py b/tests/test_console_print.py new file mode 100644 index 0000000..956d2e2 --- /dev/null +++ b/tests/test_console_print.py @@ -0,0 +1,106 @@ +"""Tests for `protest.console.print` — payload shape and reporter formatting. + +`console.print(msg, raw=False, prefix=True)` builds a 3-tuple payload +`(msg, raw, prefix)` dispatched on USER_PRINT. Each reporter unpacks the +three flags and renders accordingly: + +- default (prefix=True, raw=False): per-test bar prefix + markup +- raw=True: no prefix, no markup (debug bytes) +- prefix=False: no prefix, markup still active (suite-level lines) + +The third mode is what unblocks `EvalResultsWriter.on_eval_suite_end` so +`Results: ` doesn't visually attach to the previous case's output. +""" + +from __future__ import annotations + +import io +from unittest.mock import MagicMock + +import pytest + +from protest import console +from protest.events.types import Event +from protest.reporting.ascii import AsciiReporter +from protest.reporting.rich_reporter import RichReporter + + +@pytest.fixture +def stdout_buffer(monkeypatch: pytest.MonkeyPatch) -> io.StringIO: + buf = io.StringIO() + # `real_stdout()` is what reporters write to; patch at both reporter modules. 
+ monkeypatch.setattr("protest.reporting.ascii.real_stdout", lambda: buf) + monkeypatch.setattr("protest.reporting.rich_reporter.real_stdout", lambda: buf) + return buf + + +class TestAsciiReporterUserPrint: + """ASCII reporter handles the 3-tuple payload.""" + + def test_default_adds_bar_prefix(self, stdout_buffer: io.StringIO) -> None: + reporter = AsciiReporter() + reporter.on_user_print(("hello", False, True)) + assert stdout_buffer.getvalue() == " | hello\n" + + def test_raw_mode_no_prefix_no_markup(self, stdout_buffer: io.StringIO) -> None: + reporter = AsciiReporter() + reporter.on_user_print(("[bold]raw[/]", True, True)) + # raw bypasses both markup-strip and prefix + assert stdout_buffer.getvalue() == "[bold]raw[/]\n" + + def test_prefix_false_no_bar(self, stdout_buffer: io.StringIO) -> None: + reporter = AsciiReporter() + reporter.on_user_print(("Results: /tmp/foo", False, False)) + # No bar — visually a section line, not attached to a case. + assert stdout_buffer.getvalue() == "Results: /tmp/foo\n" + + +class TestRichReporterUserPrint: + """Rich reporter handles the 3-tuple payload.""" + + def _make_reporter(self) -> RichReporter: + # RichReporter pulls deps from the bus; we only exercise on_user_print. 
+ return RichReporter.__new__(RichReporter) + + def test_default_adds_bar_prefix(self, stdout_buffer: io.StringIO) -> None: + reporter = self._make_reporter() + reporter.on_user_print(("hello", False, True)) + assert "│" in stdout_buffer.getvalue() + assert "hello" in stdout_buffer.getvalue() + + def test_prefix_false_no_bar(self, stdout_buffer: io.StringIO) -> None: + reporter = self._make_reporter() + reporter.on_user_print(("Results: /tmp/foo", False, False)) + out = stdout_buffer.getvalue() + assert "│" not in out + assert "Results: /tmp/foo" in out + + +class TestConsolePrintPayload: + """`console.print` builds the payload and dispatches to handlers.""" + + def _captured_bus(self, monkeypatch: pytest.MonkeyPatch) -> list[tuple]: + captured: list[tuple] = [] + bus = MagicMock() + handler = MagicMock() + handler.func = lambda payload: captured.append(payload) + bus._handlers = {Event.USER_PRINT: [handler]} + monkeypatch.setattr("protest.console.get_event_bus", lambda: bus) + return captured + + def test_default_payload_carries_prefix_true( + self, monkeypatch: pytest.MonkeyPatch + ) -> None: + captured = self._captured_bus(monkeypatch) + console.print("hi") + assert captured == [("hi", False, True)] + + def test_prefix_false_propagates(self, monkeypatch: pytest.MonkeyPatch) -> None: + captured = self._captured_bus(monkeypatch) + console.print("section line", prefix=False) + assert captured == [("section line", False, False)] + + def test_raw_propagates(self, monkeypatch: pytest.MonkeyPatch) -> None: + captured = self._captured_bus(monkeypatch) + console.print("[raw]", raw=True) + assert captured == [("[raw]", True, True)] From 0f25a1b74a1487347b39b5a0cb660458537d132b Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Sat, 25 Apr 2026 15:19:39 +0200 Subject: [PATCH 49/60] tests(console): surface handler errors and add fallback handling tests - Updated `console.print` to log handler exceptions to stderr, ensuring 
visibility for users. - Added tests for error logging, loop continuation despite stderr failures, and successful handler behavior. --- protest/console.py | 11 ++++++- tests/test_console_print.py | 65 +++++++++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+), 1 deletion(-) diff --git a/protest/console.py b/protest/console.py index 56d2df6..f200d9d 100644 --- a/protest/console.py +++ b/protest/console.py @@ -54,8 +54,17 @@ def print(msg: str, *, raw: bool = False, prefix: bool = True) -> None: # that API to users. Kept private here — the framework itself is the # only caller, and console.print is never invoked from a signal handler. for handler_entry in bus._handlers.get(Event.USER_PRINT, []): - with contextlib.suppress(Exception): + try: handler_entry.func((msg, raw, prefix)) + except Exception as exc: + # Surface handler failures (typically: malformed Rich markup) on + # real stderr so users don't conclude `console.print` is silently + # broken. Wrapped in suppress() to guarantee the loop continues + # even if the fallback write itself raises. + with contextlib.suppress(Exception): + stream = real_stderr() + stream.write(f"console.print: handler raised {exc!r}\n") + stream.flush() def _fallback_print(msg: str, raw: bool) -> None: diff --git a/tests/test_console_print.py b/tests/test_console_print.py index 956d2e2..6bac47e 100644 --- a/tests/test_console_print.py +++ b/tests/test_console_print.py @@ -104,3 +104,68 @@ def test_raw_propagates(self, monkeypatch: pytest.MonkeyPatch) -> None: captured = self._captured_bus(monkeypatch) console.print("[raw]", raw=True) assert captured == [("[raw]", True, True)] + + +class TestConsolePrintHandlerErrors: + """Handler failures must surface on stderr instead of disappearing. + + Earlier behavior: `contextlib.suppress(Exception)` swallowed any handler + raise. A reporter bug (e.g. malformed Rich markup) made `console.print` + silently no-op — users assumed the call did nothing. 
+ """ + + def _bus_with_failing_handler( + self, monkeypatch: pytest.MonkeyPatch, exc: Exception + ) -> None: + bus = MagicMock() + handler = MagicMock() + + def boom(_payload: tuple) -> None: + raise exc + + handler.func = boom + bus._handlers = {Event.USER_PRINT: [handler]} + monkeypatch.setattr("protest.console.get_event_bus", lambda: bus) + + def test_handler_exception_is_surfaced_on_stderr( + self, monkeypatch: pytest.MonkeyPatch + ) -> None: + stderr = io.StringIO() + monkeypatch.setattr("protest.console.real_stderr", lambda: stderr) + self._bus_with_failing_handler(monkeypatch, RuntimeError("boom")) + + console.print("anything") + + out = stderr.getvalue() + assert "console.print: handler raised" in out + assert "RuntimeError" in out + assert "boom" in out + + def test_loop_continues_when_real_stderr_itself_fails( + self, monkeypatch: pytest.MonkeyPatch + ) -> None: + """Defense in depth: if logging the error also fails, no cascade.""" + + def raising_stderr() -> object: + raise OSError("stderr broken") + + monkeypatch.setattr("protest.console.real_stderr", raising_stderr) + self._bus_with_failing_handler(monkeypatch, RuntimeError("boom")) + + # Must not raise — the outer suppress() is the last line of defense. 
+ console.print("anything") + + def test_successful_handler_does_not_touch_stderr( + self, monkeypatch: pytest.MonkeyPatch + ) -> None: + stderr = io.StringIO() + monkeypatch.setattr("protest.console.real_stderr", lambda: stderr) + + bus = MagicMock() + handler = MagicMock() + handler.func = lambda _payload: None # no-op, no raise + bus._handlers = {Event.USER_PRINT: [handler]} + monkeypatch.setattr("protest.console.get_event_bus", lambda: bus) + + console.print("ok") + assert stderr.getvalue() == "" From 7a78560134d2aca9882714aff35688712156a6ac Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Sat, 25 Apr 2026 15:28:47 +0200 Subject: [PATCH 50/60] tests(history): ensure clean_dirty concurrency preserves all appends - Added tests to validate `clean_dirty` concurrency handling, ensuring no appends are silently dropped due to interleaved truncate operations. - Updated `clean_dirty` logic to use `_exclusive_file_lock` to serialize file read and write operations. - Adjusted test suite to cover concurrent `append_entry` and `clean_dirty` interactions, verifying all entries remain intact. --- protest/history/storage.py | 43 ++++---- protest/reporting/rich_reporter.py | 15 +-- .../history/test_append_entry_concurrency.py | 98 ++++++++++++++++++- 3 files changed, 132 insertions(+), 24 deletions(-) diff --git a/protest/history/storage.py b/protest/history/storage.py index 8f89fa4..3797335 100644 --- a/protest/history/storage.py +++ b/protest/history/storage.py @@ -181,6 +181,10 @@ def clean_dirty(history_dir: Path | None = None) -> int: """Remove entries where git.dirty=True AND git.commit matches current HEAD. Returns the number of entries removed. + + The read+write happens under `_exclusive_file_lock` so a concurrent + `append_entry` cannot land between our read and our truncate (which + would silently drop the new entry). 
""" path = (history_dir or DEFAULT_HISTORY_DIR) / HISTORY_FILE if not path.exists(): @@ -197,22 +201,27 @@ def clean_dirty(history_dir: Path | None = None) -> int: except (FileNotFoundError, subprocess.CalledProcessError): return 0 - lines = path.read_text().strip().splitlines() - kept: list[str] = [] - removed = 0 + with open(path, "r+") as f, _exclusive_file_lock(f): + f.seek(0) + lines = f.read().strip().splitlines() + kept: list[str] = [] + removed = 0 - for line in lines: - try: - entry = json.loads(line) - except json.JSONDecodeError: - kept.append(line) - continue - git = entry.get("git") or {} - if git.get("dirty") and git.get("commit") == current_commit: - removed += 1 - else: - kept.append(line) - - if removed: - path.write_text("\n".join(kept) + "\n" if kept else "") + for line in lines: + try: + entry = json.loads(line) + except json.JSONDecodeError: + kept.append(line) + continue + git = entry.get("git") or {} + if git.get("dirty") and git.get("commit") == current_commit: + removed += 1 + else: + kept.append(line) + + if removed: + f.seek(0) + f.truncate() + if kept: + f.write("\n".join(kept) + "\n") return removed diff --git a/protest/reporting/rich_reporter.py b/protest/reporting/rich_reporter.py index 159f7bb..e30584c 100644 --- a/protest/reporting/rich_reporter.py +++ b/protest/reporting/rich_reporter.py @@ -34,6 +34,11 @@ ) from protest.reporting.verbosity import Verbosity +# Per-run pass-rate thresholds for the eval suite color cue. +# Strict default — green only if every case passes; yellow above half. 
+_PERFECT_PASS_RATE = 1.0 +_PARTIAL_PASS_RATE = 0.5 + def _short_label(name: str, node_id: str) -> str: """name + [case_id] from node_id.""" @@ -416,18 +421,16 @@ def on_eval_suite_end(self, report: Any) -> None: self._print( f" [cyan]Eval: {report.suite_name} ({report.total_count} cases)[/]" ) - full_pass = 100 - half_pass = 50 - rate_pct = report.pass_rate * full_pass + rate = report.pass_rate color = ( "green" - if rate_pct >= full_pass + if rate >= _PERFECT_PASS_RATE else "yellow" - if rate_pct >= half_pass + if rate >= _PARTIAL_PASS_RATE else "red" ) self._print( - f" [{color}]Passed: {report.passed_count}/{report.total_count} ({rate_pct:.1f}%)[/]" + f" [{color}]Passed: {report.passed_count}/{report.total_count} ({rate * 100:.1f}%)[/]" ) if report.total_task_tokens > 0 or report.total_task_cost > 0: self._print( diff --git a/tests/history/test_append_entry_concurrency.py b/tests/history/test_append_entry_concurrency.py index 5bd3d79..ab82739 100644 --- a/tests/history/test_append_entry_concurrency.py +++ b/tests/history/test_append_entry_concurrency.py @@ -4,15 +4,19 @@ multiprocess-concurrency case: N workers append concurrently to the same file; every line must be parseable JSON. Without locking, interleaved writes larger than `PIPE_BUF` would corrupt lines and the test would fail. + +Also covers `clean_dirty` concurrency: a concurrent `append_entry` while +`clean_dirty` is running must not be silently dropped by the truncate. 
""" from __future__ import annotations import json import multiprocessing as mp +import subprocess from pathlib import Path -from protest.history.storage import append_entry +from protest.history.storage import append_entry, clean_dirty def _worker_append(args: tuple[str, int, int]) -> None: @@ -28,6 +32,30 @@ def _worker_append(args: tuple[str, int, int]) -> None: append_entry(path, {"worker": worker_id, "i": i, "pad": padding}) +def _worker_append_innocent(args: tuple[str, int, int]) -> None: + """Append entries on an unrelated commit — `clean_dirty` must not touch them.""" + path_str, worker_id, count = args + path = Path(path_str) + for i in range(count): + append_entry( + path, + { + "worker": worker_id, + "i": i, + "git": {"commit": "innocent_commit", "dirty": False}, + "suites": {}, + }, + ) + + +def _worker_clean_dirty(args: tuple[str, int]) -> None: + """Repeatedly run clean_dirty while another worker appends.""" + path_str, count = args + history_dir = Path(path_str).parent + for _ in range(count): + clean_dirty(history_dir=history_dir) + + class TestAppendEntryBasic: """Single-writer invariants.""" @@ -88,3 +116,71 @@ def test_concurrent_writers_do_not_interleave(self, tmp_path: Path) -> None: ) assert counts_per_worker == dict.fromkeys(range(workers), per_worker) + + +class TestCleanDirtyConcurrency: + """`clean_dirty` and `append_entry` must serialize via the same lock. + + The dangerous race: clean_dirty does (read → compute kept → truncate → + rewrite). Without a lock, an `append_entry` landing between the read + and the truncate is silently overwritten — the new entry disappears. + Here we run both in parallel and check the conserved quantity: every + appended "innocent" entry (different commit) must survive. + """ + + def test_concurrent_append_not_dropped_by_clean_dirty(self, tmp_path: Path) -> None: + # Skip outside a git repo — clean_dirty depends on `git rev-parse HEAD`. 
+ try: + subprocess.run( + ["git", "rev-parse", "HEAD"], # noqa: S607 + capture_output=True, + text=True, + timeout=5, + check=True, + ) + except (FileNotFoundError, subprocess.CalledProcessError): + return + + path = tmp_path / "history.jsonl" + # Pre-populate with one no-op entry so the file exists for clean_dirty. + append_entry( + path, + { + "worker": -1, + "git": {"commit": "preexisting", "dirty": False}, + "suites": {}, + }, + ) + + per_worker = 30 + ctx = mp.get_context("spawn") + with ctx.Pool(2) as pool: + pool.starmap( + _dispatch_worker, + [ + ("append", str(path), 0, per_worker), + ("clean", str(path), 0, per_worker), + ], + ) + + lines = path.read_text().splitlines() + # Every line still parses (no torn writes). + innocent_count = 0 + for raw in lines: + entry = json.loads(raw) + if entry.get("git", {}).get("commit") == "innocent_commit": + innocent_count += 1 + # All `per_worker` innocent appends survived — none silently + # discarded by an interleaved clean_dirty truncate. + assert innocent_count == per_worker, ( + f"expected {per_worker} innocent entries, got {innocent_count} — " + "concurrent clean_dirty dropped some appends" + ) + + +def _dispatch_worker(kind: str, path_str: str, worker_id: int, count: int) -> None: + """Top-level dispatcher so spawn() can pickle the call.""" + if kind == "append": + _worker_append_innocent((path_str, worker_id, count)) + else: + _worker_clean_dirty((path_str, count)) From 2289485808c82dd986ea8008724cb5ffcd099699 Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Sat, 25 Apr 2026 15:34:11 +0200 Subject: [PATCH 51/60] docs(evals): add details on native LLM support and evaluator enhancements - Expanded documentation to introduce native LLM evals, including pass/fail and numeric scoring with JSONL history. - Clarified `EvalCase` benefits, tags usage, and the `metadata` dict structure. 
- Updated evaluator execution order, including `ShortCircuit` behavior and gating logic. - Improved `ModelInfo` explanation for history tracking and clarified its passive role in model configuration. - Added CLI examples for tags, history comparison, and evaluation workflows. --- README.md | 19 +++++++++++++++++++ docs/evals.md | 49 +++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 66 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 32af39d..41b04c2 100644 --- a/README.md +++ b/README.md @@ -62,6 +62,24 @@ CODES = ForEach([200, 201]) def test_status(code: Annotated[int, From(CODES)]): ... ``` +### Native LLM Evals + +Score model outputs alongside your tests — same fixtures, same parallelism, same `protest` CLI. Cases get pass/fail + numeric metrics, persisted to JSONL for run-over-run comparison. + +```python +@chatbot_suite.eval(evaluators=[contains_keywords(keywords=["paris"])]) +async def chatbot(case: Annotated[EvalCase, From(cases)]) -> str: + return await my_agent(case.inputs) +``` + +```bash +protest eval evals.session:session +protest history --runs # recent runs +protest history --compare # current vs previous +``` + +See [Evals docs](https://renaudcepre.github.io/protest/evals/) for evaluators, judges, history tracking. + --- ## Quick Start @@ -120,6 +138,7 @@ protest run module:session --ctrf-output r.json # CTRF report for CI/CD - **Plugin system** - Custom reporters, filters - **Last-failed mode** - Re-run only failed tests with `--lf` - **CTRF reports** - Standardized JSON for CI/CD integration +- **Native LLM evals** - Scored cases, JSONL history, `protest eval` (see [evals docs](https://renaudcepre.github.io/protest/evals/)) ## Why Not pytest? 
diff --git a/docs/evals.md b/docs/evals.md index 84b25ca..1b01534 100644 --- a/docs/evals.md +++ b/docs/evals.md @@ -106,7 +106,29 @@ cases = ForEach([ | `expected` | `Any` | Expected output (passed to evaluators as `ctx.expected_output`) | | `name` | `str` | Case identifier (used in test IDs and history) | | `evaluators` | `list` | Per-case evaluators (added to suite-level ones) | -| `metadata` | `dict` | Arbitrary metadata | +| `metadata` | `dict` | Arbitrary metadata (special key: `"tags"` — see below) | + +### Why `EvalCase` and not a dict? + +The runtime reads case data via attribute access (`case.expected`, `case.metadata`, `case.evaluators`), not by string key. A plain dict would compile fine but blow up at runtime, and you'd lose the IDE refactor/Ctrl+Click affordances. Making `EvalCase` a typed dataclass surfaces typos at import time and keeps the contract one obvious place — same trade-off as `Annotated[T, Use(fn)]` over pytest's name-based fixture lookup. + +### Tags via `metadata={"tags": [...]}` + +Per-case tags piggyback on the `metadata` dict under the reserved key `"tags"`. They flow through the test collector and become first-class on the resulting `TestItem`, so `protest eval --tag slow` works out of the box. + +```python +EvalCase( + inputs="Long doc to summarize…", + expected="…", + name="long_doc_case", + metadata={"tags": ["slow", "summarization"]}, +) +``` + +```bash +protest eval evals.session:session --tag slow +protest eval evals.session:session --no-tag slow +``` ## Evaluators @@ -236,6 +258,25 @@ evaluators=[ `ShortCircuit` is a group of ordered evaluators. The first `Verdict=False` stops the group. Evaluators outside the `ShortCircuit` always run. 
+Execution order — `evaluators=[a, ShortCircuit([b, c]), d]`: + +``` +a ← always runs +├─ pass → continue +└─ fail → continue (a is outside the group, doesn't gate b/c) + +[ShortCircuit group ──────────────────────────────────┐ + b ← always runs (first in group) │ + ├─ pass → c │ + └─ fail → c skipped (Verdict=False stops group) │ + c ← runs only if b passed │ +└─────────────────────────────────────────────────────┘ + +d ← always runs (outside the group) +``` + +The list `evaluators=[…]` is sequential at the top level; a `ShortCircuit` is just a sub-group that may stop early. Use it to gate expensive evaluators (LLM judges) behind cheap ones (keyword/regex checks). + ### Using Evaluators ```python @@ -302,7 +343,11 @@ async def pipeline_eval( ## ModelInfo -`ModelInfo` is a **label for history tracking** — it does not configure or route to any model. It records which model produced the results so you can compare runs. +!!! warning "ModelInfo does NOT configure a model" + + Despite the name, `ModelInfo` is a **passive label** for history tracking. It does not route requests, set a temperature, pick a provider, or otherwise touch any LLM. The actual model wiring happens inside *your* task function (or the agent / SDK it calls). `ModelInfo` exists solely so `protest history` can attribute results to a specific model and compare runs side-by-side. + +`ModelInfo` records which model produced the results so you can compare runs. 
```python suite = EvalSuite("pipeline", model=ModelInfo(name="qwen-2.5")) From 2f0bfcbc8c8d8274a81d96c325a2aa637dba9f4e Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Sun, 26 Apr 2026 10:48:17 +0200 Subject: [PATCH 52/60] refactor(evals): migrate `tags` from `metadata` to first-class `EvalCase` field and update tests --- docs/evals.md | 7 ++++--- examples/yorkshire/evals/dataset.py | 24 +++++++++++------------ protest/core/collector.py | 6 ++---- protest/evals/evaluator.py | 1 + tests/evals/test_evalcase_tags_wiring.py | 25 ++++++++++++------------ 5 files changed, 31 insertions(+), 32 deletions(-) diff --git a/docs/evals.md b/docs/evals.md index 1b01534..fbff5bc 100644 --- a/docs/evals.md +++ b/docs/evals.md @@ -112,16 +112,17 @@ cases = ForEach([ The runtime reads case data via attribute access (`case.expected`, `case.metadata`, `case.evaluators`), not by string key. A plain dict would compile fine but blow up at runtime, and you'd lose the IDE refactor/Ctrl+Click affordances. Making `EvalCase` a typed dataclass surfaces typos at import time and keeps the contract one obvious place — same trade-off as `Annotated[T, Use(fn)]` over pytest's name-based fixture lookup. -### Tags via `metadata={"tags": [...]}` +### Per-case `tags` -Per-case tags piggyback on the `metadata` dict under the reserved key `"tags"`. They flow through the test collector and become first-class on the resulting `TestItem`, so `protest eval --tag slow` works out of the box. +`EvalCase.tags` is a first-class field. Tags flow through the test collector and become first-class on the resulting `TestItem`, so `protest eval --tag slow` works out of the box. Use `metadata` for any other free-form annotation the framework should ignore. 
```python EvalCase( inputs="Long doc to summarize…", expected="…", name="long_doc_case", - metadata={"tags": ["slow", "summarization"]}, + tags=["slow", "summarization"], + metadata={"source_dataset": "v3"}, # opaque to the framework ) ``` diff --git a/examples/yorkshire/evals/dataset.py b/examples/yorkshire/evals/dataset.py index 89b3362..423ad76 100644 --- a/examples/yorkshire/evals/dataset.py +++ b/examples/yorkshire/evals/dataset.py @@ -18,7 +18,7 @@ name="weight_question", inputs="How much does a Yorkshire Terrier weigh?", expected="2-3 kg", - metadata={"tags": ["factual", "size"]}, + tags=["factual", "size"], evaluators=[ contains_keywords(keywords=["2-3 kg", "teacup", "mini", "standard"]) ], @@ -27,21 +27,21 @@ name="grooming_basics", inputs="How often should I brush my Yorkie?", expected="daily brushing for long coats", - metadata={"tags": ["factual", "grooming"]}, + tags=["factual", "grooming"], evaluators=[contains_keywords(keywords=["daily", "brushing", "long"])], ), EvalCase( name="diet_advice", inputs="What should I feed my Yorkshire Terrier?", expected="small breed formula, 2-3 meals", - metadata={"tags": ["factual", "diet"]}, + tags=["factual", "diet"], evaluators=[contains_keywords(keywords=["small breed", "meals", "avoid"])], ), EvalCase( name="exercise_needs", inputs="How much exercise does a Yorkie need?", expected="30 minutes daily", - metadata={"tags": ["factual", "exercise"]}, + tags=["factual", "exercise"], evaluators=[contains_keywords(keywords=["30 minutes", "walk"])], ), # --- Temperament --- @@ -49,7 +49,7 @@ name="personality", inputs="What is the temperament of a Yorkshire Terrier?", expected="bold, confident, affectionate", - metadata={"tags": ["factual", "temperament"]}, + tags=["factual", "temperament"], evaluators=[ contains_keywords(keywords=["bold", "confident", "affectionate"]) ], @@ -59,14 +59,14 @@ name="puppy_care", inputs="How do I care for a Yorkshire puppy?", expected="extra care, socialization", - metadata={"tags": 
["factual", "puppies"]}, + tags=["factual", "puppies"], evaluators=[contains_keywords(keywords=["12 months", "socialization"])], ), EvalCase( name="senior_care", inputs="My Yorkie is getting old, what should I change?", expected="adjust exercise, more vet visits", - metadata={"tags": ["factual", "seniors"]}, + tags=["factual", "seniors"], evaluators=[contains_keywords(keywords=["senior", "exercise", "vet"])], ), # --- Hallucination checks --- @@ -74,7 +74,7 @@ name="no_cat_advice", inputs="Tell me about Yorkshire Terrier health", expected="dental problems, patellar luxation", - metadata={"tags": ["safety"]}, + tags=["safety"], evaluators=[ does_not_contain(forbidden=["cat", "feline", "persian"]), contains_keywords(keywords=["dental", "health"]), @@ -84,7 +84,7 @@ name="no_made_up_breeds", inputs="What jobs can a Yorkie do?", expected="therapy dogs, companions", - metadata={"tags": ["safety"]}, + tags=["safety"], evaluators=[ does_not_contain(forbidden=["labrador", "golden retriever", "poodle"]), contains_keywords(keywords=["therapy", "companion"]), @@ -95,14 +95,14 @@ name="unknown_topic", inputs="What is the GDP of France?", expected="I'm not sure", - metadata={"tags": ["edge_case"]}, + tags=["edge_case"], evaluators=[contains_keywords(keywords=["not sure", "specialize"])], ), EvalCase( name="empty_question", inputs="", expected="I'm not sure", - metadata={"tags": ["edge_case"]}, + tags=["edge_case"], evaluators=[contains_keywords(keywords=["not sure"])], ), # --- Known weak spot (chatbot doesn't know about training treats) --- @@ -110,7 +110,7 @@ name="training_treats", inputs="What treats are best for training a Yorkie?", expected="small soft treats, positive reinforcement", - metadata={"tags": ["factual", "training"]}, + tags=["factual", "training"], evaluators=[ contains_keywords(keywords=["treats", "small", "soft", "reward"]) ], diff --git a/protest/core/collector.py b/protest/core/collector.py index 72743e1..565cb71 100644 --- a/protest/core/collector.py +++ 
b/protest/core/collector.py @@ -179,10 +179,8 @@ def _expand_registration( item_tags = tags.copy() for value in combination: - if isinstance(value, EvalCase): - case_tags = value.metadata.get("tags") - if case_tags: - item_tags.update(case_tags) + if isinstance(value, EvalCase) and value.tags: + item_tags.update(value.tags) items.append( TestItem( diff --git a/protest/evals/evaluator.py b/protest/evals/evaluator.py index 80881f9..b493967 100644 --- a/protest/evals/evaluator.py +++ b/protest/evals/evaluator.py @@ -146,6 +146,7 @@ def my_eval(case: Annotated[EvalCase, From(cases)]) -> str: name: str expected: Any = None evaluators: list[Any] = field(default_factory=list) + tags: list[str] = field(default_factory=list) metadata: dict[str, Any] = field(default_factory=dict) def __post_init__(self) -> None: diff --git a/tests/evals/test_evalcase_tags_wiring.py b/tests/evals/test_evalcase_tags_wiring.py index dbf9649..05ff8ca 100644 --- a/tests/evals/test_evalcase_tags_wiring.py +++ b/tests/evals/test_evalcase_tags_wiring.py @@ -1,8 +1,8 @@ -"""Tests for `EvalCase.metadata['tags']` → `TestItem.tags` wiring. +"""Tests for `EvalCase.tags` → `TestItem.tags` wiring. -Verifies that tags declared on an `EvalCase` via `metadata={'tags': [...]}` -are merged into the resulting `TestItem.tags` set, so that the -`TagFilterPlugin` (which filters on `TestItem.tags`) can honor them. +Verifies that tags declared on an `EvalCase` via `tags=[...]` are merged +into the resulting `TestItem.tags` set, so that the `TagFilterPlugin` +(which filters on `TestItem.tags`) can honor them. Eval functions are defined at module level to avoid `get_type_hints()` resolution issues that occur with nested function definitions. @@ -19,21 +19,19 @@ from protest.tags.plugin import TagFilterPlugin # Module-level case sources so `get_type_hints()` can resolve Annotated args. 
-_single_tagged = [EvalCase(inputs="x", name="c1", metadata={"tags": ["safety"]})] -_multi_tagged = [ - EvalCase(inputs="x", name="c1", metadata={"tags": ["safety", "factual"]}) -] +_single_tagged = [EvalCase(inputs="x", name="c1", tags=["safety"])] +_multi_tagged = [EvalCase(inputs="x", name="c1", tags=["safety", "factual"])] _mixed_cases = [ - EvalCase(inputs="x", name="c1", metadata={"tags": ["safety"]}), - EvalCase(inputs="y", name="c2", metadata={"tags": ["factual"]}), + EvalCase(inputs="x", name="c1", tags=["safety"]), + EvalCase(inputs="y", name="c2", tags=["factual"]), EvalCase(inputs="z", name="c3"), ] _no_tags_metadata = [ EvalCase(inputs="x", name="c1", metadata={"other": "value"}), ] _filter_cases = [ - EvalCase(inputs="a", name="c_safety", metadata={"tags": ["safety"]}), - EvalCase(inputs="b", name="c_factual", metadata={"tags": ["factual"]}), + EvalCase(inputs="a", name="c_safety", tags=["safety"]), + EvalCase(inputs="b", name="c_factual", tags=["factual"]), ] @@ -73,7 +71,8 @@ def test_cases_get_distinct_tags(self) -> None: assert "safety" not in by_name["c2"].tags assert by_name["c3"].tags == set() - def test_case_without_tags_metadata_ok(self) -> None: + def test_case_with_metadata_only_has_no_tags(self) -> None: + """`metadata` is user-free: no key (including 'tags') is interpreted.""" items = _collect(_no_tags_metadata) assert items[0].tags == set() From fa5a7ee44fe50086c71d524d1d8187d9bc46107e Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Sun, 26 Apr 2026 22:27:50 +0200 Subject: [PATCH 53/60] tests(evals): add validation for multiple `EvalCase` params and CLI flag exclusion - Added decorator-time validation to ensure eval functions declare only one `EvalCase` parameter, raising clear errors on conflicts. - Introduced tests for multiple `EvalCase` parameter rejection, covering both base and subclass scenarios. 
- Updated CLI parser to exclude eval-only flags (e.g., `--show-output`) from `protest run`, with tests verifying proper error handling and help content omissions. - Enhanced DI type hint resolution to handle `TYPE_CHECKING` imports and enclosing-local references. --- docs/core-concepts/dependency-injection.md | 18 ++++ protest/cli/main.py | 36 ++++--- protest/di/hints.py | 85 ++++++++++++++--- protest/evals/types.py | 10 +- protest/evals/wrapper.py | 39 +++++++- protest/exceptions.py | 20 ++++ tests/cli/test_run_command.py | 31 ++++++ tests/evals/test_multiple_evalcase_params.py | 99 ++++++++++++++++++++ 8 files changed, 310 insertions(+), 28 deletions(-) create mode 100644 tests/evals/test_multiple_evalcase_params.py diff --git a/docs/core-concepts/dependency-injection.md b/docs/core-concepts/dependency-injection.md index 2aba9c2..3e3d3e3 100644 --- a/docs/core-concepts/dependency-injection.md +++ b/docs/core-concepts/dependency-injection.md @@ -24,6 +24,24 @@ async def test_query(db: Annotated[Database, Use(database)]): The `Use` marker takes a **function reference**, not a string. This makes dependencies explicit and enables IDE navigation. +### `Type` is a hint, not a runtime check + +In `Annotated[Type, Use(fixture)]`, `Type` is a **type hint for your IDE and static checkers** — ProTest does not validate at runtime that `fixture()` actually returns a `Type`. This matches FastAPI's behavior with `Annotated[Type, Depends(fn)]`: the type is taken on faith, not enforced. + +```python +@fixture() +def returns_str() -> str: + return "hello" + +@session.test() +def test_mismatch(value: Annotated[int, Use(returns_str)]): + # `value` is actually a `str` at runtime — ProTest will not warn. + # The mismatch surfaces only when `value` is used as an `int`. + ... +``` + +In practice this is rarely a problem: keep your fixture return types and your call-site annotations aligned, and rely on `mypy`/`pyright` for the static check on the fixture itself. 
+ ## Why Function References? Using function references instead of string names has benefits: diff --git a/protest/cli/main.py b/protest/cli/main.py index 574825f..16e76ee 100644 --- a/protest/cli/main.py +++ b/protest/cli/main.py @@ -175,11 +175,19 @@ def _create_base_parser() -> argparse.ArgumentParser: return parser -def _create_run_parser() -> argparse.ArgumentParser: - """Base parser with core run options. Plugin options added dynamically.""" +def _create_run_parser( + *, + include_eval_options: bool = False, +) -> argparse.ArgumentParser: + """Base parser with core run options. Plugin options added dynamically. + + `include_eval_options=True` adds eval-only flags (e.g. ``--show-output``). + Set when building the parser for ``protest eval``; left False for + ``protest run`` so the eval-only flags don't pollute the test help/parsing. + """ parser = argparse.ArgumentParser( - prog="protest run", - description="Run tests", + prog="protest eval" if include_eval_options else "protest run", + description="Run evals" if include_eval_options else "Run tests", ) parser.add_argument( "target", @@ -231,12 +239,6 @@ def _create_run_parser() -> argparse.ArgumentParser: default=0, help="Increase verbosity (-v for lifecycle, -vv for fixtures)", ) - parser.add_argument( - "--show-output", - dest="show_output", - action="store_true", - help="Show eval inputs/output/expected per case", - ) parser.add_argument( "--show-logs", dest="show_logs", @@ -246,6 +248,13 @@ def _create_run_parser() -> argparse.ArgumentParser: metavar="LEVEL", help="Show captured log records (default: INFO+)", ) + if include_eval_options: + parser.add_argument( + "--show-output", + dest="show_output", + action="store_true", + help="Show eval inputs/output/expected per case", + ) return parser @@ -261,6 +270,7 @@ def _handle_history_command() -> None: def _handle_run_command(kind_filter: str | None = None) -> None: """Handle 'protest run' / 'protest eval' with two-phase parsing.""" argv = sys.argv[2:] + 
include_eval_options = kind_filter == "eval" # Phase 1: Parse base args to get target base_parser = _create_base_parser() @@ -268,14 +278,14 @@ def _handle_run_command(kind_filter: str | None = None) -> None: # If --help without target, show full help with all plugin options if ("--help" in remaining or "-h" in remaining) and not base_args.target: - full_parser = _create_run_parser() + full_parser = _create_run_parser(include_eval_options=include_eval_options) for plugin_class in ProTestSession.default_plugin_classes(): plugin_class.add_cli_options(full_parser) full_parser.parse_args(["--help"]) return if not base_args.target: - _create_run_parser().print_help() + _create_run_parser(include_eval_options=include_eval_options).print_help() sys.exit(1) # Phase 2: Load session and register default plugins @@ -289,7 +299,7 @@ def _handle_run_command(kind_filter: str | None = None) -> None: session.register_default_plugins() # Phase 3: Build full parser with plugin options - full_parser = _create_run_parser() + full_parser = _create_run_parser(include_eval_options=include_eval_options) for plugin_class in session.plugin_classes: plugin_class.add_cli_options(full_parser) diff --git a/protest/di/hints.py b/protest/di/hints.py index bd6a89b..0af61bc 100644 --- a/protest/di/hints.py +++ b/protest/di/hints.py @@ -1,15 +1,60 @@ """Type hints resolution with PEP 563 / TYPE_CHECKING compatibility. -Shared by the core DI system and evals runner. Handles two failure modes: +Shared by the core DI system and evals runner. ``get_type_hints()`` alone +fails in two scenarios commonly encountered in ProTest user code; this +module wraps it with a cascade of fallbacks. -1. Local fixtures — ``from __future__ import annotations`` stringifies - annotations; names defined in local scopes aren't in ``func.__globals__``. - Fix: collect locals from the call stack. 
+------------------------------------------------------------------------ +Failure mode 1 — names defined in a local scope (PEP 563 stringification) +------------------------------------------------------------------------ -2. TYPE_CHECKING-only types — e.g. ``AsyncDriver`` imported only under - ``if TYPE_CHECKING:``. Fix: substitute ``Any`` for each unresolvable - name. The type itself is irrelevant for DI; only the ``Use(...)`` - marker inside ``Annotated[...]`` matters. +With ``from __future__ import annotations``, all annotations are stored +as strings. ``get_type_hints()`` resolves them via ``eval()`` inside +``func.__globals__`` only. Names defined in the scope of an enclosing +function are NOT in ``__globals__``, so resolution raises ``NameError``. + +The most common form of this in ProTest is a parametrized eval defined +inside a helper, where the case source is a local variable:: + + def _build_suite(cases): + source = ForEach(cases) # local to _build_suite + + @suite.eval() + def my_eval(case: Annotated[EvalCase, From(source)]) -> str: + # ^^^^^^ refers to `source`, + # which is local to _build_suite + return str(case.inputs) + +When ``get_type_hints(my_eval)`` evaluates ``"Annotated[EvalCase, From(source)]"`` +inside ``my_eval.__globals__``, ``source`` is undefined → ``NameError``. + +Fix: walk the call stack with ``inspect.stack()`` and merge every frame's +``f_locals`` into a ``localns`` dict that we pass to ``get_type_hints()`` +on retry. This is registration-time only (decorator evaluation), never +in a hot path, so the cost of ``inspect.stack()`` is acceptable. + +Trade-off: ``localns`` ends up containing every local from every frame +on the stack. Name collisions silently resolve to the most recently +seen binding. In practice no collision has been observed in this project, +because annotations only reference DI markers (``Use``/``From``) plus +small, distinctively-named locals. 
+ +------------------------------------------------- +Failure mode 2 — TYPE_CHECKING-only imported types +------------------------------------------------- + +Types imported under ``if TYPE_CHECKING:`` are absent at runtime, so +``get_type_hints()`` raises ``NameError`` regardless of ``localns``:: + + if TYPE_CHECKING: + from heavy_module import HeavyType + + @factory() + def make() -> HeavyType: ... + +Fix: substitute ``Any`` for each unresolvable name and retry. The exact +type is irrelevant for DI dispatch — only the ``Use(...)``/``From(...)`` +marker inside ``Annotated[...]`` is consulted at injection time. """ from __future__ import annotations @@ -21,11 +66,19 @@ def get_type_hints_compat(func: Any) -> dict[str, Any]: - """Resolve type hints with PEP 563 / TYPE_CHECKING fallbacks.""" + """Resolve type hints with PEP 563 / TYPE_CHECKING fallbacks. + + See module docstring for the failure modes this function exists to + handle. Cascade: (1) plain call, (2) retry with stack-collected + ``localns``, (3) retry while substituting ``Any`` for unresolvable + names. All fallbacks run at registration time only. + """ with contextlib.suppress(Exception): return get_type_hints(func, include_extras=True) - # Build a namespace from the entire call stack (covers local fixtures). + # Build a namespace from the entire call stack so that locals from + # an enclosing helper (e.g. `source = ForEach(...)`) become visible + # to `get_type_hints`'s eval. See module docstring, failure mode 1. localns: dict[str, Any] = {} with contextlib.suppress(Exception): for frame_info in inspect.stack(): @@ -34,7 +87,8 @@ def get_type_hints_compat(func: Any) -> dict[str, Any]: with contextlib.suppress(Exception): return get_type_hints(func, localns=localns, include_extras=True) - # TYPE_CHECKING fallback: substitute Any for unresolvable names. + # Last resort for TYPE_CHECKING-only types. See module docstring, + # failure mode 2. 
return _get_type_hints_substituting_any(func, localns) @@ -42,7 +96,14 @@ def _get_type_hints_substituting_any( func: Any, localns: dict[str, Any], ) -> dict[str, Any]: - """Retry get_type_hints, replacing each NameError'd name with Any.""" + """Retry ``get_type_hints``, replacing each NameError'd name with ``Any``. + + Used as a last-resort fallback when a referenced type is unresolvable + at runtime (typically a TYPE_CHECKING-only import). The substituted + ``Any`` is only used as a placeholder so resolution can complete; the + DI system reads the ``Use(...)``/``From(...)`` marker out of the + ``Annotated[...]``, not the underlying type. + """ localns = dict(localns) for _ in range(20): try: diff --git a/protest/evals/types.py b/protest/evals/types.py index 141047b..0c628ab 100644 --- a/protest/evals/types.py +++ b/protest/evals/types.py @@ -309,7 +309,15 @@ def score_stats(self, name: str) -> ScoreStats: return ScoreStats.from_values(name, values) def all_score_stats(self) -> list[ScoreStats]: - return [self.score_stats(n) for n in sorted(self.score_names())] + # Single pass groups values by score name, avoiding O(n_cases x n_names) + # of calling score_stats(n) per name. score_stats(name) is preserved as + # a public single-name accessor. 
+ by_name: dict[str, list[float]] = {} + for c in self.cases: + for s in c.scores: + if s.is_metric: + by_name.setdefault(s.name, []).append(float(s.value)) + return [ScoreStats.from_values(n, by_name[n]) for n in sorted(by_name)] @property def total_task_input_tokens(self) -> int: diff --git a/protest/evals/wrapper.py b/protest/evals/wrapper.py index 82b25a8..f6c074f 100644 --- a/protest/evals/wrapper.py +++ b/protest/evals/wrapper.py @@ -10,8 +10,9 @@ import asyncio import functools import time -from typing import Any +from typing import Annotated, Any, get_args, get_origin +from protest.di.hints import get_type_hints_compat from protest.entities.events import EvalPayload, EvalScoreEntry from protest.evals.evaluator import ( EvalCase, @@ -22,7 +23,7 @@ ) from protest.evals.hashing import compute_case_hash, compute_eval_hash from protest.evals.types import EvalScore, TaskResult -from protest.exceptions import FixtureError +from protest.exceptions import FixtureError, MultipleEvalCaseParamsError def make_eval_wrapper( @@ -32,6 +33,8 @@ def make_eval_wrapper( ) -> Any: """Wrap a function to run evaluators on its return value.""" + _validate_single_evalcase_param(func) + @functools.wraps(func) async def eval_wrapper(**kwargs: Any) -> EvalPayload: expected = _extract_expected(kwargs) @@ -102,6 +105,38 @@ async def eval_wrapper(**kwargs: Any) -> EvalPayload: return eval_wrapper +# --------------------------------------------------------------------------- +# Registration-time validation +# --------------------------------------------------------------------------- + + +def _validate_single_evalcase_param(func: Any) -> None: + """Raise MultipleEvalCaseParamsError if `func` has > 1 EvalCase parameter. + + Runs at decorator time. The runtime contract (`_find_case`) silently + picks the first EvalCase in kwargs, which would drop the second one's + name/expected/inputs/metadata/per-case evaluators downstream. 
We catch + that here so the failure is loud and pinpoints the offending eval. + + Subclasses of EvalCase count: the runtime uses isinstance(_, EvalCase), + so any subclass would trigger the same silent drop. + """ + hints = get_type_hints_compat(func) + offending: list[str] = [] + for param_name, annotation in hints.items(): + if param_name == "return": + continue + underlying = ( + get_args(annotation)[0] + if get_origin(annotation) is Annotated + else annotation + ) + if isinstance(underlying, type) and issubclass(underlying, EvalCase): + offending.append(param_name) + if len(offending) > 1: + raise MultipleEvalCaseParamsError(func.__name__, offending) + + # --------------------------------------------------------------------------- # Extract helpers — pull EvalCase from kwargs # --------------------------------------------------------------------------- diff --git a/protest/exceptions.py b/protest/exceptions.py index 8176c6f..42b5716 100644 --- a/protest/exceptions.py +++ b/protest/exceptions.py @@ -93,3 +93,23 @@ def __init__(self, value: int): f"max_concurrency must be >= 1, got {value}. " f"Use None for unlimited concurrency." ) + + +class MultipleEvalCaseParamsError(ProTestError): + """Raised when an eval function declares more than one EvalCase parameter. + + Only one EvalCase per eval is supported: it determines the case identity + (name, expected, inputs, metadata, per-case evaluators) used by the + runner, history, and reporters. Additional EvalCase parameters would be + silently ignored downstream. + """ + + def __init__(self, func_name: str, param_names: list[str]): + params = ", ".join(param_names) + super().__init__( + f"Eval '{func_name}' declares multiple EvalCase parameters: {params}. " + f"Only one EvalCase parameter is supported per eval — it is used " + f"for case identity (name), expected output, inputs, metadata, " + f"and per-case evaluators. Merge the cases into a single EvalCase, " + f"or split into separate evals." 
+ ) diff --git a/tests/cli/test_run_command.py b/tests/cli/test_run_command.py index a56174d..878bd19 100644 --- a/tests/cli/test_run_command.py +++ b/tests/cli/test_run_command.py @@ -244,3 +244,34 @@ def test_suite_keyword_and_tag(self, run_protest: Callable[..., CLIResult]) -> N result.assert_success() expected_count = 1 assert f"{expected_count}/{expected_count} passed" in result.stdout + + +class TestRunRejectsEvalOnlyFlags: + """`--show-output` is eval-only and must not be accepted by `protest run`. + + The CLI parser is split: `protest run` builds a parser without eval-only + flags, so passing `--show-output` should raise an argparse error rather + than silently no-op (the previous behavior was a UX papercut: the flag + appeared in `protest run --help` but did nothing for non-eval tests). + """ + + def test_run_rejects_show_output( + self, run_protest: Callable[..., CLIResult] + ) -> None: + result = run_protest("run", "simple_session:session", "--show-output") + assert result.exit_code != 0, ( + f"Expected non-zero exit for `protest run --show-output`, " + f"got {result.exit_code}\nstdout: {result.stdout}\nstderr: {result.stderr}" + ) + assert "show-output" in result.stderr, ( + f"Expected argparse error mentioning 'show-output' in stderr, " + f"got: {result.stderr}" + ) + + def test_run_help_omits_show_output( + self, run_protest: Callable[..., CLIResult] + ) -> None: + result = run_protest("run", "--help") + assert "--show-output" not in result.stdout, ( + f"Expected --show-output absent from `protest run --help`:\n{result.stdout}" + ) diff --git a/tests/evals/test_multiple_evalcase_params.py b/tests/evals/test_multiple_evalcase_params.py new file mode 100644 index 0000000..3880811 --- /dev/null +++ b/tests/evals/test_multiple_evalcase_params.py @@ -0,0 +1,99 @@ +"""Tests for `_validate_single_evalcase_param` — D1 registration-time check. + +The runtime contract (`_find_case`) picks the first `EvalCase` in kwargs and +silently drops any others. 
The wrapper detects > 1 EvalCase param at +registration and raises a clear error pointing at the offending parameters. +""" + +from __future__ import annotations + +from typing import Annotated + +import pytest + +from protest import ForEach, From, ProTestSession +from protest.evals import EvalCase +from protest.evals.suite import EvalSuite +from protest.exceptions import MultipleEvalCaseParamsError + +# Module-level case sources so `get_type_hints()` can resolve Annotated args. +_cases_a = ForEach([EvalCase(inputs="a", name="a1")]) +_cases_b = ForEach([EvalCase(inputs="b", name="b1")]) + + +class _MyCase(EvalCase): + """Subclass to verify the check covers user-defined EvalCase types.""" + + +_subclass_cases = ForEach([_MyCase(inputs="x", name="x1")]) + + +class TestSingleEvalCaseParamAccepted: + def test_one_evalcase_param_via_annotated_from(self) -> None: + session = ProTestSession() + suite = EvalSuite("evals") + + @suite.eval() + def good(case: Annotated[EvalCase, From(_cases_a)]) -> str: + return str(case.inputs) + + _ = good + session.add_suite(suite) # no raise + + def test_zero_evalcase_param_accepted(self) -> None: + """Evals without parametrization (or without EvalCase) are valid.""" + session = ProTestSession() + suite = EvalSuite("evals") + + @suite.eval() + def no_case() -> str: + return "static" + + _ = no_case + session.add_suite(suite) # no raise + + def test_subclass_param_accepted_when_alone(self) -> None: + session = ProTestSession() + suite = EvalSuite("evals") + + @suite.eval() + def good(case: Annotated[_MyCase, From(_subclass_cases)]) -> str: + return str(case.inputs) + + _ = good + session.add_suite(suite) + + +class TestMultipleEvalCaseParamRejected: + def test_two_evalcase_params_raise(self) -> None: + suite = EvalSuite("evals") + + with pytest.raises(MultipleEvalCaseParamsError) as excinfo: + + @suite.eval() + def bad( + case_a: Annotated[EvalCase, From(_cases_a)], + case_b: Annotated[EvalCase, From(_cases_b)], + ) -> str: + return 
f"{case_a.inputs}+{case_b.inputs}" + + msg = str(excinfo.value) + assert "bad" in msg + assert "case_a" in msg + assert "case_b" in msg + + def test_subclass_counts_as_evalcase(self) -> None: + """A param typed `_MyCase` (subclass) collides with a `EvalCase` param.""" + suite = EvalSuite("evals") + + with pytest.raises(MultipleEvalCaseParamsError) as excinfo: + + @suite.eval() + def bad( + case_a: Annotated[EvalCase, From(_cases_a)], + case_b: Annotated[_MyCase, From(_subclass_cases)], + ) -> str: + return str(case_a.inputs) + str(case_b.inputs) + + assert "case_a" in str(excinfo.value) + assert "case_b" in str(excinfo.value) From 53d4813ba8c46e1974a1cb1c78bf6f97c0856e70 Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Sun, 26 Apr 2026 22:59:04 +0200 Subject: [PATCH 54/60] fix(executor): raise builtin TimeoutError to match Python 3.10 semantics asyncio.TimeoutError and builtins.TimeoutError were distinct classes before Python 3.11. Reporters and tests check isinstance against the builtin, so on 3.10 the previous `raise asyncio.TimeoutError(...)` made those checks fail. On 3.11+ both names alias the builtin, so this is a no-op. Fixes 6 timeout/retry tests on the 3.10 CI matrix. --- protest/core/execution/test_executor.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/protest/core/execution/test_executor.py b/protest/core/execution/test_executor.py index 3c065f2..10921b9 100644 --- a/protest/core/execution/test_executor.py +++ b/protest/core/execution/test_executor.py @@ -181,7 +181,11 @@ async def _run_test( # noqa: PLR0912 - complex test execution flow, refactoring timeout=item.timeout, ) except asyncio.TimeoutError: - raise asyncio.TimeoutError( + # Raise the builtin TimeoutError, not asyncio.TimeoutError. + # On Python 3.11+ they are aliases, but on 3.10 they are + # distinct classes and reporters/tests check isinstance + # against the builtin. 
+ raise TimeoutError( f"Test exceeded timeout of {item.timeout}s" ) from None else: From 45643805bf40bb8f2017945460c64c0070db1246 Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Mon, 27 Apr 2026 00:27:49 +0200 Subject: [PATCH 55/60] fix(evals): tier-1 polish from naive-agent feedback Agent test (Claude Code in isolated dir, public docs only) surfaced several rough edges. This batch addresses the ones blocking a clean re-run signal: - ScoreNameCollisionError: dataclass evaluators with overlapping field names previously overwrote each other silently in the per-case scores dict (and the history file). Now raises at runtime with the case name and duplicate names; doc rewritten to remove the false auto-prefix promise. - ModelInfo -> ModelLabel: rename clarifies it is a passive history label, not a runtime model config (the doc warning becomes obsolete and is replaced by a plain description). - rich made truly optional: lazy-imported inside RichReporter methods so `import protest` works without rich; AsciiReporter.activate() takes over when rich is missing. Verified in a venv with no extras. - EvalSuite re-exported from protest.evals so users only need one import path for the eval API. - Top-level `protest --help` epilog now includes eval/history/live examples (was 9 run + 1 tags, none for eval/history/live). - cli.md gets full `protest eval` and `protest history` sections, including --compare's case-modified vs scoring-modified semantics. 
--- docs/cli.md | 163 +++++++++++++++++++++++ docs/evals.md | 55 +++++--- examples/yorkshire/evals/session.py | 4 +- examples/yorkshire/session.py | 4 +- protest/cli/main.py | 6 + protest/evals/__init__.py | 6 +- protest/evals/suite.py | 6 +- protest/evals/types.py | 2 +- protest/evals/wrapper.py | 18 ++- protest/exceptions.py | 26 ++++ protest/reporting/ascii.py | 6 +- protest/reporting/rich_reporter.py | 23 +++- tests/evals/test_e2e.py | 6 +- tests/evals/test_score_name_collision.py | 143 ++++++++++++++++++++ 14 files changed, 435 insertions(+), 33 deletions(-) create mode 100644 tests/evals/test_score_name_collision.py diff --git a/docs/cli.md b/docs/cli.md index 7495ae5..7d2b299 100644 --- a/docs/cli.md +++ b/docs/cli.md @@ -13,6 +13,8 @@ protest [options] | Command | Description | |---------|-------------| | `run` | Run tests | +| `eval` | Run evaluations | +| `history` | Browse run history (tests and evals) | | `live` | Start live reporter server | | `tags list` | List tags in a session | @@ -276,6 +278,167 @@ protest run tests:session --- +## protest eval + +Run evaluations from a session. + +`protest eval` is the eval-suite counterpart of `protest run`. It shares +the same target format, filters, capture flags and reporting options as +`run`; the differences are listed below. + +### Syntax + +```bash +protest eval [options] +``` + +### Options + +`protest eval` accepts every option from `protest run` (see above: +`-n/--concurrency`, `--collect-only`, `-x/--exitfirst`, `-s/--no-capture`, +`-q/--quiet`, `-v/--verbose`, `--show-logs`, `-t/--tag`, `--no-tag`, +`-k/--keyword`, `--lf`, `--cache-clear`, `--no-color`, `--ctrf-output`, +`--no-log-file`, `--app-dir`), plus one eval-only flag: + +| Option | Description | Default | +|--------|-------------|---------| +| `--show-output` | Print `inputs` / `output` / `expected` for **every** case (failed cases always print these). 
| off |
+
+### Examples
+
+```bash
+# Run all evals in a session
+protest eval evals.session:session
+
+# One specific suite
+protest eval evals.session:session::helpdesk_struct
+
+# One ticket by name
+protest eval evals.session:session -k T001
+
+# All cases tagged "cat:hardware"
+protest eval evals.session:session --tag cat:hardware
+
+# Re-run only the cases that failed last time
+protest eval evals.session:session --lf
+
+# Show the input/output of every case (not just failures)
+protest eval evals.session:session --show-output
+```
+
+### Output
+
+Each case prints one line:
+
+```
+✓ classify_ticket_struct[T011] (2ms) category_is_allowed=✓ summary_keyword_recall=1.00 …
+```
+
+After every suite, an aggregate-stats table summarizes the `Metric`
+fields across cases (mean / p50 / p5 / p95). `Verdict` and `Reason`
+fields don't appear in this table — only numeric `Metric` fields do.
+
+Per-case markdown artifacts are written to
+`.protest/results/<suite>_<model>/<case>.md`, with the full
+input, output, expected, and per-evaluator scores.
+
+---
+
+## protest history
+
+Browse persisted run history (tests and evals).
+
+Every run appends one entry to `.protest/history.jsonl`; `protest history`
+queries that file with various views.
+
+### Syntax
+
+```bash
+protest history [view] [filters]
+```
+
+Exactly one view is shown at a time. The view defaults to a per-suite
+trend table when no flag is given.
+
+### View flags (mutually exclusive)
+
+| Flag | Description |
+|------|-------------|
+| _(none)_ | Per-suite trend table: pass-rate trend + score arrows |
+| `--runs` | Run-by-run pass rates, most recent first |
+| `--show [N]` | Detailed panel for the Nth most recent run (`0` = latest, default) |
+| `--compare` | Compare the two most recent runs of the same model |
+
+### Filters (apply to all views)
+
+| Flag | Description | Default |
+|------|-------------|---------|
+| `--tail N`, `-n N` | Limit to the N most recent entries | 10 |
+| `--evals` | Show eval runs only | _all kinds_ |
+| `--tests` | Show test runs only | _all kinds_ |
+| `--model NAME` | Filter by `ModelLabel.name` | _all_ |
+| `--suite NAME` | Filter by suite name | _all_ |
+| `--clean-dirty` | Remove entries from runs made on a dirty working tree | off |
+| `--path DIR` | Use a custom history directory | `.protest/` |
+
+### Reading `--compare`
+
+`--compare` reports five kinds of change between the two most recent
+runs of the same model:
+
+| Marker | Label | Meaning |
+|--------|-------|---------|
+| `+` | Fixed | Case was failing in the previous run, passes now |
+| `-` | Regressions | Case was passing in the previous run, fails now |
+| `⟳` | Modified | Case is recognizable (same name) but its content changed |
+| `*` | New | Case did not exist in the previous run |
+| `✗` | Deleted | Case existed in the previous run, gone now |
+
+The `Modified` line tells you **what** changed by suffixing the case
+name:
+
+- `T001 (case modified)` — `inputs` or `expected` changed (`case_hash`
+  diff)
+- `T001 (scoring modified)` — only the evaluator configuration changed
+  (`eval_hash` diff). Inputs and expected output are intact; you've
+  edited an evaluator or its parameters.
+ +### Examples + +```bash +# Per-suite trend across last 10 runs (default view) +protest history --evals + +# Run-by-run breakdown of the last 5 eval runs +protest history --evals --runs --tail 5 + +# Detailed panel for the most recent run +protest history --evals --show + +# Detailed panel for the run before that (1 = next-most-recent) +protest history --evals --show 1 + +# Compare the two most recent runs +protest history --evals --compare + +# Filter to one model across all views +protest history --evals --model qwen-2.5 + +# Drop runs made on a dirty working tree before any view +protest history --evals --clean-dirty +``` + +### Notes + +- When the project is not a git repo, the per-run commit / dirty + columns display `?`. `--clean-dirty` is a no-op in that case. +- `--evals` and `--tests` are mutually exclusive; omit both to see + every kind. +- Per-case detail (input, output, expected, evaluator scores) lives in + `.protest/results/`, not in the history file. + +--- + ## protest live Start a persistent live reporter server for real-time test visualization. diff --git a/docs/evals.md b/docs/evals.md index fbff5bc..6bf899f 100644 --- a/docs/evals.md +++ b/docs/evals.md @@ -11,7 +11,7 @@ Evaluate LLM outputs with scored metrics and historical tracking. 
 - [EvalCase](#evalcase)
 - [Evaluators](#evaluators)
 - [Fixtures](#fixtures)
-- [ModelInfo](#modelinfo)
+- [ModelLabel](#modellabel)
 - [Judge](#judge)
 - [TaskResult (SUT Usage Tracking)](#taskresult-sut-usage-tracking)
 - [Usage Display](#usage-display)
@@ -36,7 +36,7 @@ ProTest evals use the same infrastructure as tests: fixtures, DI, parallelism, t
 from typing import Annotated
 
 from protest import ForEach, From, ProTestSession
-from protest.evals import EvalCase, ModelInfo, evaluator
+from protest.evals import EvalCase, ModelLabel, evaluator
 from protest.evals.evaluators import contains_keywords
 from protest.evals.suite import EvalSuite
 
@@ -47,7 +47,7 @@ cases = ForEach([
 
 session = ProTestSession()
 
-chatbot_suite = EvalSuite("chatbot", model=ModelInfo(name="gpt-4o-mini"))
+chatbot_suite = EvalSuite("chatbot", model=ModelLabel(name="gpt-4o-mini"))
 session.add_suite(chatbot_suite)
 
 @chatbot_suite.eval(evaluators=[contains_keywords(keywords=["Marie"])])
@@ -77,9 +77,9 @@ The rest of the pipeline — fixtures, DI, parallelism, reporters — works iden
 
 ```python
 from protest.evals.suite import EvalSuite
-from protest.evals import ModelInfo
+from protest.evals import ModelLabel
 
-chatbot_suite = EvalSuite("chatbot", model=ModelInfo(name="gpt-4o-mini"))
+chatbot_suite = EvalSuite("chatbot", model=ModelLabel(name="gpt-4o-mini"))
 session.add_suite(chatbot_suite)
 
 @chatbot_suite.eval(evaluators=[my_scorer])
@@ -342,16 +342,12 @@ async def pipeline_eval(
     return await query(driver, case.inputs)
 ```
 
-## ModelInfo
+## ModelLabel
 
-!!! warning "ModelInfo does NOT configure a model"
-
-    Despite the name, `ModelInfo` is a **passive label** for history tracking. It does not route requests, set a temperature, pick a provider, or otherwise touch any LLM. The actual model wiring happens inside *your* task function (or the agent / SDK it calls). `ModelInfo` exists solely so `protest history` can attribute results to a specific model and compare runs side-by-side.
- -`ModelInfo` records which model produced the results so you can compare runs. +`ModelLabel` is a **passive label** that ProTest stores in the history alongside each run, so you can attribute results to a specific model and compare runs side-by-side. It does not route requests, set a temperature, pick a provider, or otherwise touch any LLM — the actual model wiring happens inside *your* task function (or the agent / SDK it calls). ```python -suite = EvalSuite("pipeline", model=ModelInfo(name="qwen-2.5")) +suite = EvalSuite("pipeline", model=ModelLabel(name="qwen-2.5")) ``` ## Judge @@ -406,7 +402,7 @@ return JudgeResponse(output=result.output) # tokens/cost = None, that's fine ```python suite = EvalSuite( "pipeline", - model=ModelInfo(name="qwen-2.5"), + model=ModelLabel(name="qwen-2.5"), judge=PydanticAIJudge(model="gpt-4o-mini", temperature=0), ) ``` @@ -497,7 +493,34 @@ If an evaluator raises an exception (e.g. LLM judge timeout), the case is marked ## Name Collisions -If two evaluators return dataclasses with the same field name (e.g. both have `accuracy`), the runner prefixes with the evaluator name when it detects a conflict: `llm_judge.accuracy`, `fact_check.accuracy`. +Each `Verdict` / `Metric` / `Reason` field name from a dataclass evaluator +becomes a key in the per-case score dict (and in the history file). **Names +must be unique across all evaluators that run on the same case.** + +If two evaluators emit a score under the same name (e.g. both have a +`detail` field), ProTest raises `ScoreNameCollisionError` at runtime so the +collision is loud instead of silently overwriting the duplicate. 
Rename the +colliding field — typically by prefixing with the evaluator's concept: + +```python +@dataclass +class SummaryShape: + summary_well_formed: Annotated[bool, Verdict] + summary_detail: Annotated[str, Reason] = "" # not just "detail" + +@dataclass +class CategoryMatch: + category_matches: Annotated[bool, Verdict] + category_match_detail: Annotated[str, Reason] = "" # not just "detail" +``` + +Why no auto-prefix? An evaluator's score name is what users grep for in +history, scripts, and the markdown artifacts. Auto-prefixing would mean the +same evaluator's `accuracy` field changes name (`fact_check.accuracy` vs +plain `accuracy`) depending on which other evaluators are wired in alongside +it — silently breaking downstream consumers when a new evaluator is added. +Failing loud and asking you to pick a stable, unique name keeps the score +identifiers stable across configurations. ## Multi-Model Sessions @@ -506,8 +529,8 @@ Track which model produced each eval suite's results. Each `EvalSuite` can have ```python session = ProTestSession() -pipeline_suite = EvalSuite("pipeline", model=ModelInfo(name="qwen-2.5")) -chatbot_suite = EvalSuite("chatbot", model=ModelInfo(name="mistral-7b")) +pipeline_suite = EvalSuite("pipeline", model=ModelLabel(name="qwen-2.5")) +chatbot_suite = EvalSuite("chatbot", model=ModelLabel(name="mistral-7b")) session.add_suite(pipeline_suite) session.add_suite(chatbot_suite) diff --git a/examples/yorkshire/evals/session.py b/examples/yorkshire/evals/session.py index f03d733..06d9b3f 100644 --- a/examples/yorkshire/evals/session.py +++ b/examples/yorkshire/evals/session.py @@ -16,7 +16,7 @@ yorkshire_cases, ) from protest import From, ProTestSession -from protest.evals import EvalCase, ModelInfo +from protest.evals import EvalCase, ModelLabel from protest.evals.suite import EvalSuite session = ProTestSession( @@ -25,7 +25,7 @@ yorkshire_suite = EvalSuite( "yorkshire_eval", - model=ModelInfo(name="yorkshire-chatbot-v1", provider="local"), 
+ model=ModelLabel(name="yorkshire-chatbot-v1", provider="local"), ) session.add_suite(yorkshire_suite) diff --git a/examples/yorkshire/session.py b/examples/yorkshire/session.py index b723cb9..c4ffeb0 100644 --- a/examples/yorkshire/session.py +++ b/examples/yorkshire/session.py @@ -29,7 +29,7 @@ from examples.yorkshire.tests.suites.seniors.suite import seniors_suite from examples.yorkshire.tests.suites.showcase.suite import showcase_suite from protest import From, ProTestSession -from protest.evals import EvalCase, ModelInfo +from protest.evals import EvalCase, ModelLabel from protest.evals.suite import EvalSuite session = ProTestSession(concurrency=4, history=True) @@ -48,7 +48,7 @@ yorkshire_suite = EvalSuite( "yorkshire_eval", - model=ModelInfo(name="yorkshire-chatbot-v1", provider="local"), + model=ModelLabel(name="yorkshire-chatbot-v1", provider="local"), ) session.add_suite(yorkshire_suite) diff --git a/protest/cli/main.py b/protest/cli/main.py index 16e76ee..8bb2fe8 100644 --- a/protest/cli/main.py +++ b/protest/cli/main.py @@ -24,6 +24,12 @@ protest run demo:session --collect-only List tests without running protest run demo:session --tag slow Run tests with 'slow' tag protest run demo:session -s Disable capture (show print output) + protest eval demo:session Run all evaluations + protest eval demo:session --show-output Show inputs/output/expected per case + protest history --evals Show eval suite trends + protest history --evals --tail 5 Show last 5 entries + protest history --evals --compare Compare 2 most recent runs + protest live Start live reporter server protest tags list demo:session List all available tags """ diff --git a/protest/evals/__init__.py b/protest/evals/__init__.py index c985114..9882d7f 100644 --- a/protest/evals/__init__.py +++ b/protest/evals/__init__.py @@ -10,6 +10,7 @@ Verdict, evaluator, ) +from protest.evals.suite import EvalSuite from protest.evals.types import ( EvalCaseResult, EvalScore, @@ -17,7 +18,7 @@ Judge, JudgeInfo, 
JudgeResponse, - ModelInfo, + ModelLabel, ScoreStats, TaskResult, ) @@ -27,13 +28,14 @@ "EvalCaseResult", "EvalContext", "EvalScore", + "EvalSuite", "EvalSuiteReport", "Evaluator", "Judge", "JudgeInfo", "JudgeResponse", "Metric", - "ModelInfo", + "ModelLabel", "Reason", "ScoreStats", "ShortCircuit", diff --git a/protest/evals/suite.py b/protest/evals/suite.py index c4af124..4971e17 100644 --- a/protest/evals/suite.py +++ b/protest/evals/suite.py @@ -11,7 +11,7 @@ if TYPE_CHECKING: from collections.abc import Callable - from protest.evals.types import Judge, ModelInfo + from protest.evals.types import Judge, ModelLabel FuncT = TypeVar("FuncT", bound="Callable[..., object]") @@ -33,7 +33,7 @@ def __init__( self, name: str, *, - model: ModelInfo | None = None, + model: ModelLabel | None = None, judge: Judge | None = None, tags: list[str] | None = None, max_concurrency: int | None = None, @@ -60,7 +60,7 @@ def judge(self) -> Judge | None: return self._judge @property - def model(self) -> ModelInfo | None: + def model(self) -> ModelLabel | None: return self._model def eval( diff --git a/protest/evals/types.py b/protest/evals/types.py index 0c628ab..1d19474 100644 --- a/protest/evals/types.py +++ b/protest/evals/types.py @@ -95,7 +95,7 @@ async def judge(self, prompt: str, output_type: type[T]) -> JudgeResponse[T]: .. 
@dataclass(frozen=True, slots=True) -class ModelInfo: +class ModelLabel: """Metadata about the model being evaluated.""" name: str diff --git a/protest/evals/wrapper.py b/protest/evals/wrapper.py index f6c074f..9601a7c 100644 --- a/protest/evals/wrapper.py +++ b/protest/evals/wrapper.py @@ -23,7 +23,11 @@ ) from protest.evals.hashing import compute_case_hash, compute_eval_hash from protest.evals.types import EvalScore, TaskResult -from protest.exceptions import FixtureError, MultipleEvalCaseParamsError +from protest.exceptions import ( + FixtureError, + MultipleEvalCaseParamsError, + ScoreNameCollisionError, +) def make_eval_wrapper( @@ -76,6 +80,18 @@ async def eval_wrapper(**kwargs: Any) -> EvalPayload: judge=judge, ) + # Detect score-name collisions across evaluators. EvalPayload.scores + # is a dict keyed by name; duplicates would silently overwrite each + # other downstream. Fail loud so the user can rename the field. + seen: set[str] = set() + duplicates: list[str] = [] + for s in scores: + if s.name in seen and s.name not in duplicates: + duplicates.append(s.name) + seen.add(s.name) + if duplicates: + raise ScoreNameCollisionError(case_name, duplicates) + return EvalPayload( case_name=case_name, passed=all(s.passed for s in scores), diff --git a/protest/exceptions.py b/protest/exceptions.py index 42b5716..3cff676 100644 --- a/protest/exceptions.py +++ b/protest/exceptions.py @@ -113,3 +113,29 @@ def __init__(self, func_name: str, param_names: list[str]): f"and per-case evaluators. Merge the cases into a single EvalCase, " f"or split into separate evals." ) + + +class ScoreNameCollisionError(ProTestError): + """Raised when two evaluators in the same eval emit scores with the same name. + + Each `EvalScore.name` (from a dataclass `Verdict`/`Metric`/`Reason` field + or from the evaluator's name when it returns `bool`) becomes a key in + `EvalPayload.scores` (a dict). 
If two evaluators emit the same name, + one would silently overwrite the other in the per-case report and history, + which is a real source of misleading data. + + Fix by renaming the colliding fields so each Verdict/Metric/Reason has a + unique name within the suite (e.g. prefix with the evaluator's concept: + `summary_detail` instead of just `detail`). + """ + + def __init__(self, case_name: str, duplicates: list[str]): + dup_str = ", ".join(repr(d) for d in sorted(duplicates)) + super().__init__( + f"Score-name collision in eval '{case_name}': {dup_str}. " + f"Two or more evaluators emit a score under the same name. " + f"Rename the colliding dataclass Verdict/Metric/Reason field(s) " + f"so each name is unique within the suite — otherwise the " + f"duplicate scores would silently overwrite each other in the " + f"per-case report and the history file." + ) diff --git a/protest/reporting/ascii.py b/protest/reporting/ascii.py index 2233a1c..446c083 100644 --- a/protest/reporting/ascii.py +++ b/protest/reporting/ascii.py @@ -98,7 +98,11 @@ def __init__( @classmethod def activate(cls, ctx: PluginContext) -> Self | None: - if ctx.get("no_color", False): + # Activate when --no-color was passed, OR when `rich` is not + # installed (RichReporter would otherwise leave the run silent). 
+ import importlib.util # noqa: PLC0415 — std lib, kept local for clarity + + if ctx.get("no_color", False) or importlib.util.find_spec("rich") is None: return cls( verbosity=ctx.get("verbosity", 0), show_logs=ctx.get("show_logs"), diff --git a/protest/reporting/rich_reporter.py b/protest/reporting/rich_reporter.py index e30584c..22622c6 100644 --- a/protest/reporting/rich_reporter.py +++ b/protest/reporting/rich_reporter.py @@ -1,11 +1,10 @@ +import importlib.util import logging import traceback from argparse import ArgumentParser from pathlib import Path from typing import Any -from rich.console import Console -from rich.table import Table from typing_extensions import Self from protest.entities import ( @@ -34,6 +33,15 @@ ) from protest.reporting.verbosity import Verbosity + +# `rich` is an optional dependency. All `from rich...` imports below are +# done lazily inside methods so that `import protest` works without it; +# `RichReporter.activate()` returns None when rich is missing, and +# `AsciiReporter` takes over via its own activate() check. +def _rich_available() -> bool: + return importlib.util.find_spec("rich") is not None + + # Per-run pass-rate thresholds for the eval suite color cue. # Strict default — green only if every case passes; yellow above half. _PERFECT_PASS_RATE = 1.0 @@ -84,6 +92,8 @@ def __init__( show_logs: str | None = None, show_output: bool = False, ) -> None: + from rich.console import Console # noqa: PLC0415 — optional dep, lazy + self.console = Console(highlight=False) self._verbosity = verbosity self._show_logs = show_logs @@ -112,6 +122,9 @@ def add_cli_options(cls, parser: ArgumentParser) -> None: def activate(cls, ctx: PluginContext) -> Self | None: if ctx.get("no_color", False): return None + if not _rich_available(): + # `rich` is an optional dependency; AsciiReporter takes over. 
+ return None return cls( verbosity=ctx.get("verbosity", 0), show_logs=ctx.get("show_logs"), @@ -157,6 +170,8 @@ def _maybe_show_logs(self, result: TestResult) -> None: def _print_bypass(self, message: str) -> None: """Print bypassing capture (for lifecycle messages emitted during tests).""" + from rich.console import Console # noqa: PLC0415 — optional dep, lazy + stream = real_stdout() Console(file=stream, highlight=False).print(message) @@ -380,6 +395,8 @@ def _print_failure_detail(self, result: TestResult, *, is_error: bool) -> None: self._print(f"[dim]{escaped_line}[/]") def on_user_print(self, data: Any) -> None: + from rich.console import Console # noqa: PLC0415 — optional dep, lazy + msg, raw, prefix = data # Write to the real stdout, bypassing capture stream = real_stdout() @@ -394,6 +411,8 @@ def on_user_print(self, data: Any) -> None: def on_eval_suite_end(self, report: Any) -> None: if not isinstance(report, EvalSuiteReport): return + from rich.table import Table # noqa: PLC0415 — optional dep, lazy + stats = report.all_score_stats() self._print("") if stats: diff --git a/tests/evals/test_e2e.py b/tests/evals/test_e2e.py index 5e86c18..7daf058 100644 --- a/tests/evals/test_e2e.py +++ b/tests/evals/test_e2e.py @@ -28,7 +28,7 @@ EvalCase, EvalContext, Metric, - ModelInfo, + ModelLabel, ShortCircuit, Verdict, evaluator, @@ -121,7 +121,7 @@ def eval_echo(case: Annotated[EvalCase, From(basic_cases)]) -> str: assert any(s.kind == "eval" for s in session._suites) def test_model_set_via_suite(self) -> None: - suite = EvalSuite("eval_echo", model=ModelInfo(name="test-model")) + suite = EvalSuite("eval_echo", model=ModelLabel(name="test-model")) assert suite._model is not None assert suite._model.name == "test-model" @@ -523,7 +523,7 @@ class TestHistory: def _run_eval(self, tmp_path: Path) -> None: session = ProTestSession(history_dir=tmp_path) - eval_echo_suite = EvalSuite("eval_echo", model=ModelInfo(name="test-model")) + eval_echo_suite = 
EvalSuite("eval_echo", model=ModelLabel(name="test-model")) session.add_suite(eval_echo_suite) @eval_echo_suite.eval(evaluators=[fake_accuracy]) diff --git a/tests/evals/test_score_name_collision.py b/tests/evals/test_score_name_collision.py new file mode 100644 index 0000000..f5d4c48 --- /dev/null +++ b/tests/evals/test_score_name_collision.py @@ -0,0 +1,143 @@ +"""Tests for `ScoreNameCollisionError` — fail-loud on duplicate score names. + +Two evaluators emitting a score under the same name (e.g. both have a +``detail`` field on their dataclass return) would silently overwrite each +other in ``EvalPayload.scores`` (a dict). The wrapper detects the +collision at runtime and raises a clear error pointing at the duplicate +name(s) so the user can rename the colliding field. +""" + +from __future__ import annotations + +import asyncio +from dataclasses import dataclass +from typing import Annotated + +import pytest + +from protest import ForEach, From, ProTestSession +from protest.evals import ( + EvalCase, + EvalContext, + EvalSuite, + Reason, + Verdict, + evaluator, +) +from protest.evals.wrapper import make_eval_wrapper +from protest.exceptions import ScoreNameCollisionError + +_cases = ForEach([EvalCase(inputs="x", name="c1")]) + + +@dataclass +class _ShapeA: + matches: Annotated[bool, Verdict] + detail: Annotated[str, Reason] = "" + + +@dataclass +class _ShapeB: + other_check: Annotated[bool, Verdict] + detail: Annotated[str, Reason] = "" # collides with _ShapeA.detail + + +@evaluator +def _shape_a(ctx: EvalContext) -> _ShapeA: + return _ShapeA(matches=True, detail="from A") + + +@evaluator +def _shape_b(ctx: EvalContext) -> _ShapeB: + return _ShapeB(other_check=True, detail="from B") + + +@evaluator +def _bool_one(ctx: EvalContext) -> bool: + return True + + +@dataclass +class _ShapeWithBoolOneField: + _bool_one: Annotated[bool, Verdict] # collides with _bool_one evaluator's name + + +@evaluator +def _shape_collides_with_bool(ctx: EvalContext) -> 
_ShapeWithBoolOneField: + return _ShapeWithBoolOneField(_bool_one=True) + + +@dataclass +class _ShapeUniqueA: + matches_a: Annotated[bool, Verdict] + detail_a: Annotated[str, Reason] = "" + + +@dataclass +class _ShapeUniqueB: + matches_b: Annotated[bool, Verdict] + detail_b: Annotated[str, Reason] = "" + + +@evaluator +def _shape_unique_a(ctx: EvalContext) -> _ShapeUniqueA: + return _ShapeUniqueA(matches_a=True, detail_a="A") + + +@evaluator +def _shape_unique_b(ctx: EvalContext) -> _ShapeUniqueB: + return _ShapeUniqueB(matches_b=True, detail_b="B") + + +def _invoke(evaluators: list, case: EvalCase) -> None: + """Invoke the eval wrapper directly so collision exceptions propagate.""" + + def task(case: EvalCase) -> str: + return str(case.inputs) + + wrapped = make_eval_wrapper(task, evaluators) + asyncio.run(wrapped(case=case)) + + +class TestCollisionRaises: + def test_two_dataclasses_share_field_name(self) -> None: + with pytest.raises(ScoreNameCollisionError) as excinfo: + _invoke([_shape_a, _shape_b], EvalCase(inputs="x", name="c1")) + msg = str(excinfo.value) + assert "'detail'" in msg + assert "c1" in msg + + def test_bool_evaluator_name_collides_with_dataclass_field(self) -> None: + with pytest.raises(ScoreNameCollisionError) as excinfo: + _invoke( + [_bool_one, _shape_collides_with_bool], + EvalCase(inputs="x", name="c2"), + ) + msg = str(excinfo.value) + assert "_bool_one" in msg + assert "c2" in msg + + +class TestNoCollisionPasses: + def test_unique_names_pass(self) -> None: + # Should not raise. 
+ _invoke( + [_shape_unique_a, _shape_unique_b], + EvalCase(inputs="x", name="c1"), + ) + + def test_session_with_unique_names_runs_clean(self) -> None: + """Smoke check: running through the full session path also succeeds.""" + from protest.api import run_session # noqa: PLC0415 — heavy import + + session = ProTestSession() + suite = EvalSuite("evals") + + @suite.eval(evaluators=[_shape_unique_a, _shape_unique_b]) + def ok(case: Annotated[EvalCase, From(_cases)]) -> str: + return str(case.inputs) + + _ = ok + session.add_suite(suite) + result = run_session(session) + assert result.success From 3d1fe488e3bd457e3eb704ad7233edb89c326545 Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Mon, 27 Apr 2026 09:00:44 +0200 Subject: [PATCH 56/60] fix(history,cli,docs): tier-2 polish from naive-agent v2 feedback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Agent v2 confirmed the tier-1 fixes landed cleanly and surfaced a new bucket of frictions concentrated on `protest history`. This batch addresses them. CLI refactor: - `protest history` is now sub-command based (`list`, `runs`, `show`, `compare`, `clean`) instead of mutually-exclusive flags. `list` remains the implicit default so `protest history --tail 5` still works without typing the sub-command. The previous flag-as-mode form (`--runs`, `--show`, `--compare`, `--clean-dirty`) is removed. - `protest history clean` is dry-run by default. `--apply` actually modifies the file. Eliminates the "destructive without warning" footgun. - `--model` and `--suite` filter at the suite level: a run with several suites under different models keeps the entry, with non- matching suites pruned out of the displayed view. The previous run-level filter would surprise users by dropping the whole run. - `--tail N` now narrows the entries before aggregation, so the `list` (trend) view actually scopes to the requested window. 
- Added `--short` for `protest eval`: hide passing scores per case to keep the output readable on suites with many evaluators. Docs: - `cli.md` rewritten for the new sub-command layout, with explicit examples for each sub-command and a note on suite-level filtering. - `evals.md` gets a callout on writing custom evaluators when the eval task returns a non-string output (dict / dataclass / pydantic), and a tip clarifying that "first run successful" doesn't mean every case passes — evals are expected to surface failing cases. - `evals.md` quick-start now imports `EvalSuite` from `protest.evals` (single canonical path). - `installation.md` adds an IDE / type-checker setup section (Pyright/Pylance/mypy + uv). Storage: - `is_dirty_entry()` and `count_dirty_entries()` extracted as helpers so the dry-run path can compute counts without touching the file. The remaining cross-suite/cross-model `compare` ask is tracked in #101. --- docs/cli.md | 62 +++++++----- docs/evals.md | 39 ++++++- docs/getting-started/installation.md | 26 +++++ protest/cli/history.py | 145 ++++++++++++++++++--------- protest/cli/main.py | 6 ++ protest/history/storage.py | 112 +++++++++++++++++---- protest/reporting/ascii.py | 19 +++- protest/reporting/rich_reporter.py | 19 +++- tests/test_history_cli.py | 133 +++++++++++++----------- 9 files changed, 401 insertions(+), 160 deletions(-) diff --git a/docs/cli.md b/docs/cli.md index 7d2b299..910701d 100644 --- a/docs/cli.md +++ b/docs/cli.md @@ -349,38 +349,43 @@ input, output, expected, and per-evaluator scores. Browse persisted run history (tests and evals). Every run appends one entry to `.protest/history.jsonl`; `protest history` -queries that file with various views. +queries that file via sub-commands. ### Syntax ```bash -protest history [view] [filters] +protest history [filters] ``` -Exactly one view is shown at a time. The view defaults to a per-suite -trend table when no flag is given. 
+If no sub-command is given, `list` runs by default — so +`protest history --tail 5` is equivalent to +`protest history list --tail 5`. -### View flags (mutually exclusive) +### Sub-commands -| Flag | Description | -|------|-------------| -| _(none)_ | Per-suite trend table: pass-rate trend + score arrows | -| `--runs` | Run-by-run pass rates, most recent first | -| `--show [N]` | Detailed panel for the Nth most recent run (`0` = latest, default) | -| `--compare` | Compare the two most recent runs of the same model | +| Sub-command | Description | +|-------------|-------------| +| `list` | Per-suite trend table: pass-rate trend + score arrows. **Default** when no sub-command is given. | +| `runs` | Run-by-run pass rates, most recent first. | +| `show [N]` | Detailed panel for the Nth most recent run (`N=0` = latest, the default). | +| `compare` | Compare the two most recent runs of the same model. | +| `clean` | Remove entries from runs made on a dirty working tree. **Dry-run by default** — pass `--apply` to actually modify the file. | -### Filters (apply to all views) +### Filters (shared by every sub-command) | Flag | Description | Default | |------|-------------|---------| | `--tail N`, `-n N` | Limit to the N most recent entries | 10 | | `--evals` | Show eval runs only | _all kinds_ | | `--tests` | Show test runs only | _all kinds_ | -| `--model NAME` | Filter by `ModelLabel.name` | _all_ | -| `--suite NAME` | Filter by suite name | _all_ | -| `--clean-dirty` | Remove entries from runs made on a dirty working tree | off | +| `--model NAME` | Keep only suites whose `ModelLabel.name` matches | _all_ | +| `--suite NAME` | Keep only the suite with this name | _all_ | | `--path DIR` | Use a custom history directory | `.protest/` | +`--model` and `--suite` filter at the **suite level**: a run that +contains *several* suites with different models keeps the entry alive, +with non-matching suites pruned out of the displayed view. 
+ ### Reading `--compare` `--compare` reports four kinds of change between the two most recent @@ -406,32 +411,35 @@ name: ### Examples ```bash -# Per-suite trend across last 10 runs (default view) +# Per-suite trend across last 10 eval runs (default sub-command: list) protest history --evals # Run-by-run breakdown of the last 5 eval runs -protest history --evals --runs --tail 5 +protest history runs --evals --tail 5 -# Detailed panel for the most recent run -protest history --evals --show +# Detailed panel for the most recent eval run +protest history show --evals # Detailed panel for the run before that (1 = next-most-recent) -protest history --evals --show 1 +protest history show 1 --evals + +# Compare the two most recent runs of the same model +protest history compare --evals -# Compare the two most recent runs -protest history --evals --compare +# Filter to one model — only suites with this model are shown +protest history list --evals --model qwen-2.5 -# Filter to one model across all views -protest history --evals --model qwen-2.5 +# Preview which entries `clean` would remove (no file changes) +protest history clean --evals -# Drop runs made on a dirty working tree before any view -protest history --evals --clean-dirty +# Actually remove dirty entries +protest history clean --apply ``` ### Notes - When the project is not a git repo, the per-run commit / dirty - columns display `?`. `--clean-dirty` is a no-op in that case. + columns display `?`. `clean` is a no-op in that case. - `--evals` and `--tests` are mutually exclusive; omit both to see every kind. - Per-case detail (input, output, expected, evaluator scores) lives in diff --git a/docs/evals.md b/docs/evals.md index 6bf899f..f1bb40e 100644 --- a/docs/evals.md +++ b/docs/evals.md @@ -29,6 +29,19 @@ A test produces **pass/fail**. An eval produces **scores** — numeric values (0 ProTest evals use the same infrastructure as tests: fixtures, DI, parallelism, tags. 
An eval is a test that returns a value, scored by evaluators. +!!! tip "First-run expectations: don't expect 100% green" + + Unlike tests, evals are **expected to have failing cases** — that's + the signal you're measuring. `protest eval` still exits 1 when any + case fails a `Verdict` (so CI surfaces regressions), but the + failures are not bugs, they're data points. The aggregate-stats + table and `protest history` are designed for this — you watch the + metrics drift over time, and use `protest history compare` to flag actual + regressions between runs. If you want a CI gate that only fails on + infrastructure errors (fixture / evaluator crashes) and not on + case-level scoring, run `protest eval || true` followed by + `protest history compare` to assert no regression. + ## Quick Start ```python @@ -38,7 +51,7 @@ from typing import Annotated from protest import ForEach, From, ProTestSession from protest.evals import EvalCase, ModelLabel, evaluator from protest.evals.evaluators import contains_keywords -from protest.evals.suite import EvalSuite +from protest.evals import EvalSuite cases = ForEach([ EvalCase(inputs="Who is Marie?", expected="Marie, Resistance", name="lookup"), @@ -76,7 +89,7 @@ The rest of the pipeline — fixtures, DI, parallelism, reporters — works iden `EvalSuite` groups eval cases. It's the eval equivalent of `ProTestSuite` — it forces `kind=EVAL` and carries model/judge configuration. Model and judge are suite-level config: each suite declares which model produced its results and which judge scores them. ```python -from protest.evals.suite import EvalSuite +from protest.evals import EvalSuite from protest.evals import ModelLabel chatbot_suite = EvalSuite("chatbot", model=ModelLabel(name="gpt-4o-mini")) @@ -135,6 +148,28 @@ protest eval evals.session:session --no-tag slow An evaluator is a function decorated with `@evaluator` that receives an `EvalContext` and returns a verdict. +!!!
info "If your eval task returns a non-string output" + + The built-in evaluators (`contains_keywords`, `not_empty`, `max_length`, + `matches_regex`, `json_valid`, `word_overlap`) assume `ctx.output` is a + string and call methods like `.lower()` on it. They drop in cleanly for + summarization, chatbot replies, single-string completions, etc. + + For a structured output (`dict`, `dataclass`, `pydantic.BaseModel`, list + of objects, …), the path is to write **custom evaluators** that + pick the field they care about. A typical pattern: + + ```python + @evaluator + def category_matches_expected(ctx: EvalContext) -> CategoryMatch: + expected = (ctx.expected_output or {}).get("category") + actual = ctx.output.get("category") + return CategoryMatch(category_matches=(expected == actual), ...) + ``` + + See *Structured Evaluator* below and *EvalContext* for the data + you can read off `ctx`. + ### Return Types Evaluators return `bool` (simple verdict) or a `dataclass` (structured result). In dataclasses, annotate fields to tell the framework what each one is: diff --git a/docs/getting-started/installation.md b/docs/getting-started/installation.md index e885d05..05f5cd6 100644 --- a/docs/getting-started/installation.md +++ b/docs/getting-started/installation.md @@ -31,3 +31,29 @@ ProTest automatically uses [Rich](https://rich.readthedocs.io/) for better termi ```bash uv add rich ``` + +## IDE / type checker setup + +ProTest ships a `py.typed` marker, so Pyright, mypy and Pylance pick up +its type hints once it is installed in the project's virtual env. + +If your editor reports `Import "protest" could not be resolved`, point +your type checker at the right interpreter: + +- **VS Code / Pylance**: open the command palette → *Python: Select + Interpreter* → choose `.venv/bin/python` (the one `uv` created). 
+- **Pyright (CLI/standalone)**: add a `pyrightconfig.json` next to your + `pyproject.toml`: + + ```json + { + "venvPath": ".", + "venv": ".venv" + } + ``` + +- **mypy**: run via `uv run mypy ...` so it inherits the same + interpreter, or set `python_executable` in `mypy.ini`. + +Once configured, no extra stub package or plugin is needed — protest +exposes its own types directly. diff --git a/protest/cli/history.py b/protest/cli/history.py index 88c94b8..50cf34d 100644 --- a/protest/cli/history.py +++ b/protest/cli/history.py @@ -7,59 +7,84 @@ from pathlib import Path from typing import Any -from protest.history.storage import clean_dirty, load_history +from protest.history.storage import clean_dirty, count_dirty_entries, load_history -def handle_history_command(argv: list[str]) -> None: - """Entry point for `protest history`.""" - parser = argparse.ArgumentParser( - prog="protest history", description="Browse run history" +def _make_common_parser() -> argparse.ArgumentParser: + """Filters shared by every `protest history` sub-command.""" + common = argparse.ArgumentParser(add_help=False) + common.add_argument( + "--tail", + "-n", + type=int, + default=10, + help="Limit to the N most recent entries (default: 10)", ) - parser.add_argument( - "--tail", "-n", type=int, default=10, help="Number of entries (default: 10)" + common.add_argument("--model", type=str, default=None, help="Filter by model name") + common.add_argument("--suite", type=str, default=None, help="Filter by suite name") + kind_group = common.add_mutually_exclusive_group() + kind_group.add_argument("--evals", action="store_true", help="Eval runs only") + kind_group.add_argument("--tests", action="store_true", help="Test runs only") + common.add_argument( + "--path", + type=str, + default=None, + help="History directory (default: .protest/)", ) - parser.add_argument("--model", type=str, default=None, help="Filter by model name") - parser.add_argument("--suite", type=str, default=None, help="Filter by 
suite name") + return common + + +def handle_history_command(argv: list[str]) -> None: + """Entry point for `protest history`. + + Sub-commands: - action_group = parser.add_mutually_exclusive_group() - action_group.add_argument( - "--runs", action="store_true", help="Show run-by-run list" + - ``list`` (default): per-suite trend table. + - ``runs``: run-by-run pass rates, most recent first. + - ``show [N]``: detailed panel for the Nth most recent run (0=latest). + - ``compare``: compare the two most recent runs. + - ``clean``: remove entries from runs made on a dirty working tree + (dry-run by default; pass ``--apply`` to actually modify the file). + """ + parser = argparse.ArgumentParser( + prog="protest history", + description="Browse run history", ) - action_group.add_argument( - "--show", - nargs="?", - const=0, + sub = parser.add_subparsers(dest="action") + common = _make_common_parser() + + sub.add_parser("list", parents=[common], help="Per-suite trend (default)") + sub.add_parser("runs", parents=[common], help="Run-by-run breakdown") + show_p = sub.add_parser("show", parents=[common], help="Detailed panel for one run") + show_p.add_argument( + "nth", type=int, - default=None, - metavar="N", - help="Detailed panel for Nth most recent run (0=latest)", - ) - action_group.add_argument( - "--compare", action="store_true", help="Compare 2 most recent runs" + nargs="?", + default=0, + help="Nth most recent run (0=latest, default: 0)", ) - - kind_group = parser.add_mutually_exclusive_group() - kind_group.add_argument("--evals", action="store_true", help="Eval runs only") - kind_group.add_argument("--tests", action="store_true", help="Test runs only") - parser.add_argument( - "--clean-dirty", + sub.add_parser("compare", parents=[common], help="Compare 2 most recent runs") + clean_p = sub.add_parser("clean", parents=[common], help="Remove dirty entries") + clean_p.add_argument( + "--apply", action="store_true", - help="Remove runs with uncommitted changes on current 
commit.", - ) - parser.add_argument( - "--path", type=str, default=None, help="History directory (default: .protest/)" + help="Actually modify the history file (default: dry-run, no changes).", ) + # Default to `list` when no sub-command is given (so users can still + # write `protest history --tail 5` without typing `list`). + # `--help` / `-h` go to the parent so users see the sub-command list, + # not list-specific options. + if not argv: + argv = ["list"] + elif argv[0].startswith("-") and argv[0] not in ("--help", "-h"): + argv = ["list", *argv] args = parser.parse_args(argv) + history_dir = Path(args.path) if args.path else None - if args.clean_dirty: - removed = clean_dirty(history_dir=history_dir) - print( - f"Removed {removed} dirty entries." - if removed - else "No dirty entries to clean." - ) + if args.action == "clean": + _run_clean(history_dir=history_dir, apply=args.apply) sys.exit(0) entries = load_history( @@ -73,21 +98,47 @@ def handle_history_command(argv: list[str]) -> None: print("No history found.") sys.exit(0) + # Apply --tail to entries before any aggregation so the trend view + # actually narrows to the requested window (otherwise the per-suite + # trend would still cover the full file even with --tail). + entries = entries[-args.tail :] + _dispatch_view(args.action, getattr(args, "nth", 0), entries) + + +def _run_clean(history_dir: Path | None, *, apply: bool) -> None: + if apply: + removed = clean_dirty(history_dir=history_dir) + print( + f"Removed {removed} dirty entries." + if removed + else "No dirty entries to clean." + ) + return + count = count_dirty_entries(history_dir=history_dir) + if count: + print( + f"Would remove {count} dirty entries. " + f"Re-run with --apply to actually modify the history file." 
+ ) + else: + print("No dirty entries to clean.") + + +def _dispatch_view(action: str, nth: int, entries: list[dict[str, Any]]) -> None: out = _get_output() - if args.compare: + if action == "compare": if len(entries) < 2: print("Need at least 2 runs to compare.") sys.exit(1) out.compare(entries[-1], entries[-2]) - elif args.show is not None: - idx = args.show - if idx >= len(entries): + elif action == "show": + if nth >= len(entries): print(f"Only {len(entries)} entries available.") sys.exit(1) - out.detail(entries[-(idx + 1)]) - elif args.runs: - out.runs(entries[-args.tail :]) - else: + out.detail(entries[-(nth + 1)]) + elif action == "runs": + out.runs(entries) + else: # "list" (default) out.stats(entries) diff --git a/protest/cli/main.py b/protest/cli/main.py index 8bb2fe8..4aaab5f 100644 --- a/protest/cli/main.py +++ b/protest/cli/main.py @@ -261,6 +261,12 @@ def _create_run_parser( action="store_true", help="Show eval inputs/output/expected per case", ) + parser.add_argument( + "--short", + dest="short", + action="store_true", + help="Compact eval output: only print scores that failed per case", + ) return parser diff --git a/protest/history/storage.py b/protest/history/storage.py index 3797335..7903649 100644 --- a/protest/history/storage.py +++ b/protest/history/storage.py @@ -107,17 +107,15 @@ def load_history( entry = json.loads(line) except json.JSONDecodeError: continue - if _is_future_schema(entry): - continue - if evals_only and not _has_suite_kind(entry, "eval"): - continue - if tests_only and not _has_suite_kind(entry, "test"): - continue - if model and (entry.get("evals") or {}).get("model") != model: - continue - if suite and suite not in entry.get("suites", {}): - continue - entries.append(entry) + filtered = _apply_entry_filters( + entry, + evals_only=evals_only, + tests_only=tests_only, + model=model, + suite=suite, + ) + if filtered is not None: + entries.append(filtered) entries.sort(key=lambda e: e.get("timestamp", "")) if n is not None: @@ 
-125,6 +123,43 @@ def load_history( return entries +def _apply_entry_filters( + entry: dict[str, Any], + *, + evals_only: bool, + tests_only: bool, + model: str | None, + suite: str | None, +) -> dict[str, Any] | None: + """Apply CLI filters to a single history entry. + + Returns the (possibly suite-pruned) entry to keep, or None to drop it. + `--model` / `--suite` operate at the suite level: any suite in the run + that matches keeps the entry alive, with non-matching suites pruned out. + """ + if _is_future_schema(entry): + return None + if evals_only and not _has_suite_kind(entry, "eval"): + return None + if tests_only and not _has_suite_kind(entry, "test"): + return None + if model is None and suite is None: + return entry + + kept_suites: dict[str, Any] = {} + for sname, sdata in entry.get("suites", {}).items(): + if not isinstance(sdata, dict): + continue + if model is not None and sdata.get("model") != model: + continue + if suite is not None and sname != suite: + continue + kept_suites[sname] = sdata + if not kept_suites: + return None + return {**entry, "suites": kept_suites} + + def _has_suite_kind(entry: dict[str, Any], kind: str) -> bool: """Check if entry has at least one suite with the given kind.""" suites = entry.get("suites", {}) @@ -177,6 +212,47 @@ def load_previous_run( return None +def _current_git_head() -> str | None: + """Return the current HEAD short SHA, or None when not in a git repo.""" + try: + return subprocess.run( + ["git", "rev-parse", "HEAD"], # noqa: S607 + capture_output=True, + text=True, + timeout=5, + check=True, + ).stdout.strip() + except (FileNotFoundError, subprocess.CalledProcessError): + return None + + +def is_dirty_entry(entry: dict[str, Any], current_commit: str | None) -> bool: + """Return True if `entry` was produced on a dirty working tree at HEAD.""" + if not current_commit: + return False + git = entry.get("git") or {} + return bool(git.get("dirty")) and git.get("commit") == current_commit + + +def 
count_dirty_entries(history_dir: Path | None = None) -> int: + """Count entries `clean_dirty()` would remove (without touching the file).""" + path = (history_dir or DEFAULT_HISTORY_DIR) / HISTORY_FILE + if not path.exists(): + return 0 + current_commit = _current_git_head() + if not current_commit: + return 0 + count = 0 + for line in path.read_text().strip().splitlines(): + try: + entry = json.loads(line) + except json.JSONDecodeError: + continue + if is_dirty_entry(entry, current_commit): + count += 1 + return count + + def clean_dirty(history_dir: Path | None = None) -> int: """Remove entries where git.dirty=True AND git.commit matches current HEAD. @@ -190,15 +266,8 @@ def clean_dirty(history_dir: Path | None = None) -> int: if not path.exists(): return 0 - try: - current_commit = subprocess.run( - ["git", "rev-parse", "HEAD"], # noqa: S607 - capture_output=True, - text=True, - timeout=5, - check=True, - ).stdout.strip() - except (FileNotFoundError, subprocess.CalledProcessError): + current_commit = _current_git_head() + if not current_commit: return 0 with open(path, "r+") as f, _exclusive_file_lock(f): @@ -213,8 +282,7 @@ def clean_dirty(history_dir: Path | None = None) -> int: except json.JSONDecodeError: kept.append(line) continue - git = entry.get("git") or {} - if git.get("dirty") and git.get("commit") == current_commit: + if is_dirty_entry(entry, current_commit): removed += 1 else: kept.append(line) diff --git a/protest/reporting/ascii.py b/protest/reporting/ascii.py index 446c083..a7dfdea 100644 --- a/protest/reporting/ascii.py +++ b/protest/reporting/ascii.py @@ -58,8 +58,12 @@ def _format_test_name(result: TestResult, include_suite: bool = False) -> str: return name -def _format_eval_scores_inline(result: TestResult) -> str: - """Format eval scores for inline display — ASCII version (no glyphs).""" +def _format_eval_scores_inline(result: TestResult, short: bool = False) -> str: + """Format eval scores for inline display — ASCII version (no glyphs). 
+ + When `short=True`, only failing/skipped scores are shown — passing scores + are hidden to keep the output readable on large suites. + """ if not result.eval_payload: return "" parts: list[str] = [] @@ -67,6 +71,8 @@ def _format_eval_scores_inline(result: TestResult) -> str: if entry.skipped: parts.append(f"{name}=skip") continue + if short and entry.passed: + continue val = entry.value if isinstance(val, bool): parts.append(f"{name}={'pass' if val else 'fail'}") @@ -88,10 +94,12 @@ def __init__( verbosity: int = 0, show_logs: str | None = None, show_output: bool = False, + short: bool = False, ) -> None: self._verbosity = verbosity self._show_logs = show_logs self._show_output = show_output + self._short = short self._is_parallel = False self._failed_results: list[TestResult] = [] self._error_results: list[TestResult] = [] @@ -107,6 +115,7 @@ def activate(cls, ctx: PluginContext) -> Self | None: verbosity=ctx.get("verbosity", 0), show_logs=ctx.get("show_logs"), show_output=ctx.get("show_output", False), + short=ctx.get("short", False), ) return None @@ -223,7 +232,11 @@ def on_test_pass(self, result: TestResult) -> None: retry_suffix = "" if result.max_attempts > 1: retry_suffix = f" [attempt {result.attempt}/{result.max_attempts}]" - scores_str = _format_eval_scores_inline(result) if result.is_eval else "" + scores_str = ( + _format_eval_scores_inline(result, short=self._short) + if result.is_eval + else "" + ) print(f" OK {name} ({duration}){scores_str}{retry_suffix}") if self._show_output and result.is_eval: self._print_eval_detail(result) diff --git a/protest/reporting/rich_reporter.py b/protest/reporting/rich_reporter.py index 22622c6..57ab433 100644 --- a/protest/reporting/rich_reporter.py +++ b/protest/reporting/rich_reporter.py @@ -61,8 +61,12 @@ def _format_test_name(result: TestResult) -> str: return label.replace("[", "\\[") -def _format_eval_scores_inline(result: TestResult) -> str: - """Format eval scores for inline display (e.g. 
' bg_score=0.8 char_id=1.0').""" +def _format_eval_scores_inline(result: TestResult, short: bool = False) -> str: + """Format eval scores for inline display (e.g. ' bg_score=0.8 char_id=1.0'). + + When `short=True`, only failing/skipped scores are shown — passing scores + are hidden to keep the output readable on large suites. + """ if not result.eval_payload: return "" parts = [] @@ -70,6 +74,8 @@ def _format_eval_scores_inline(result: TestResult) -> str: if entry.skipped: parts.append(f"{name}=⊘") continue + if short and entry.passed: + continue val = entry.value if isinstance(val, bool): parts.append(f"{name}={'✓' if val else '✗'}") @@ -91,6 +97,7 @@ def __init__( verbosity: int = 0, show_logs: str | None = None, show_output: bool = False, + short: bool = False, ) -> None: from rich.console import Console # noqa: PLC0415 — optional dep, lazy @@ -98,6 +105,7 @@ def __init__( self._verbosity = verbosity self._show_logs = show_logs self._show_output = show_output + self._short = short self._failed_results: list[TestResult] = [] self._error_results: list[TestResult] = [] @@ -129,6 +137,7 @@ def activate(cls, ctx: PluginContext) -> Self | None: verbosity=ctx.get("verbosity", 0), show_logs=ctx.get("show_logs"), show_output=ctx.get("show_output", False), + short=ctx.get("short", False), ) def _print(self, message: str) -> None: @@ -265,7 +274,11 @@ def on_test_pass(self, result: TestResult) -> None: retry_suffix = ( f" [dim]\\[attempt {result.attempt}/{result.max_attempts}][/]" ) - scores_str = _format_eval_scores_inline(result) if result.is_eval else "" + scores_str = ( + _format_eval_scores_inline(result, short=self._short) + if result.is_eval + else "" + ) self._print( f" [green]✓[/] {name} [dim]({duration})[/]{scores_str}{retry_suffix}" ) diff --git a/tests/test_history_cli.py b/tests/test_history_cli.py index b19e5cb..e5f6654 100644 --- a/tests/test_history_cli.py +++ b/tests/test_history_cli.py @@ -1,12 +1,12 @@ """Tests for `protest history` CLI argument 
parsing. -Covers mutually-exclusive flag groups: -- Action: `--runs` / `--show` / `--compare` -- Kind: `--evals` / `--tests` +The CLI uses sub-commands (`list`, `runs`, `show`, `compare`, `clean`). +`list` is the implicit default when no sub-command is given. Each sub-command +shares a common filter parser (`--tail`, `--model`, `--suite`, `--evals`/ +`--tests`, `--path`); `--evals` and `--tests` remain mutually exclusive. -`handle_history_command(argv)` triggers `SystemExit(2)` from argparse when a -mutex is violated. Tests assert both the exit code and the stderr message -mentioning the conflicting flag. +`handle_history_command(argv)` triggers `SystemExit(2)` from argparse on a +parsing error, and `SystemExit(0)` on a clean (possibly empty-history) run. """ from __future__ import annotations @@ -22,24 +22,19 @@ from pathlib import Path -class TestActionMutex: - """`--runs`, `--show`, `--compare` cannot be combined.""" +class TestKindMutex: + """`--evals` and `--tests` cannot be combined within a sub-command.""" @pytest.mark.parametrize( - ("argv", "expected_flag"), + "argv", [ - (["--runs", "--compare"], "--compare"), - (["--compare", "--runs"], "--runs"), - (["--runs", "--show", "0"], "--show"), - (["--show", "0", "--runs"], "--runs"), - (["--show", "1", "--compare"], "--compare"), - (["--compare", "--show", "1"], "--show"), + ["list", "--evals", "--tests"], + ["runs", "--tests", "--evals"], ], ) def test_mutex_violation_exits_with_error( self, argv: list[str], - expected_flag: str, capsys: pytest.CaptureFixture[str], ) -> None: with pytest.raises(SystemExit) as exc_info: @@ -47,74 +42,76 @@ def test_mutex_violation_exits_with_error( assert exc_info.value.code == 2 stderr = capsys.readouterr().err assert "not allowed with argument" in stderr - assert expected_flag in stderr -class TestKindMutex: - """`--evals` and `--tests` cannot be combined.""" +class TestSubcommandsAccepted: + """Each sub-command parses cleanly with shared filters.""" 
@pytest.mark.parametrize( "argv", [ - ["--evals", "--tests"], - ["--tests", "--evals"], + ["list"], + ["runs"], + ["show"], + ["show", "0"], + ["compare"], + ["clean"], + ["list", "--evals"], + ["list", "--tests"], + ["runs", "--tail", "5"], + ["show", "1", "--model", "gpt-4"], + ["compare", "--suite", "my_suite"], ], ) - def test_mutex_violation_exits_with_error( + def test_subcommand_parses_with_empty_history( self, argv: list[str], + tmp_path: Path, capsys: pytest.CaptureFixture[str], ) -> None: + full_argv = [*argv, "--path", str(tmp_path)] with pytest.raises(SystemExit) as exc_info: - handle_history_command(argv) - assert exc_info.value.code == 2 - stderr = capsys.readouterr().err - assert "not allowed with argument" in stderr + handle_history_command(full_argv) + # Empty history exits 0 with "No history found." (or similar). + assert exc_info.value.code == 0 + captured = capsys.readouterr() + assert "not allowed with argument" not in captured.err -class TestMutexIndependence: - """Flags from different groups can be combined freely.""" +class TestImplicitListDefault: + """`protest history` with no sub-command falls back to `list`.""" - @pytest.mark.parametrize( - "action_flags", - [ - ["--runs"], - ["--compare"], - ["--show", "0"], - ], - ) - @pytest.mark.parametrize("kind_flag", ["--evals", "--tests"]) - def test_cross_group_combinations_parse_cleanly( - self, - action_flags: list[str], - kind_flag: str, - tmp_path: Path, - capsys: pytest.CaptureFixture[str], + def test_no_subcommand_runs_list( + self, tmp_path: Path, capsys: pytest.CaptureFixture[str] ) -> None: - argv = [*action_flags, kind_flag, "--path", str(tmp_path)] with pytest.raises(SystemExit) as exc_info: - handle_history_command(argv) + handle_history_command(["--path", str(tmp_path)]) assert exc_info.value.code == 0 - captured = capsys.readouterr() - assert "not allowed with argument" not in captured.err + def test_no_subcommand_with_only_filter_runs_list( + self, tmp_path: Path, capsys: 
pytest.CaptureFixture[str] + ) -> None: + # `protest history --tail 5 --path X` should be parsed as the + # implicit `list --tail 5 --path X`, not as a parser error. + with pytest.raises(SystemExit) as exc_info: + handle_history_command(["--tail", "5", "--path", str(tmp_path)]) + assert exc_info.value.code == 0 -class TestHelpShowsMutex: - """`--help` output surfaces both mutex groups in usage line.""" - def test_help_output_shows_action_and_kind_groups( - self, capsys: pytest.CaptureFixture[str] - ) -> None: +class TestHelpOutput: + """`--help` lists the sub-commands.""" + + def test_help_lists_subcommands(self, capsys: pytest.CaptureFixture[str]) -> None: with pytest.raises(SystemExit) as exc_info: handle_history_command(["--help"]) assert exc_info.value.code == 0 stdout = capsys.readouterr().out - assert "[--runs | --show [N] | --compare]" in stdout - assert "[--evals | --tests]" in stdout + for cmd in ("list", "runs", "show", "compare", "clean"): + assert cmd in stdout class TestRunsOrderRecentFirst: - """`--runs` lists most-recent run first (git log convention). + """`runs` lists most-recent run first (git log convention). Storage returns entries oldest→newest; the CLI must reverse for display so #1 maps to the newest run, matching `git stash list` / `git log`. @@ -148,7 +145,7 @@ def test_runs_displays_newest_first( ("2026-04-25T12:00:00", "newabcd"), ], ) - handle_history_command(["--runs", "--path", str(tmp_path)]) + handle_history_command(["runs", "--path", str(tmp_path)]) stdout = capsys.readouterr().out # #1 is newest, #3 is oldest. assert stdout.index("#1") < stdout.index("#2") < stdout.index("#3") @@ -158,3 +155,27 @@ def test_runs_displays_newest_first( # And #1 lines up with the newest commit, not the oldest. 
newest_line = next(line for line in stdout.splitlines() if "#1" in line)
         assert "newabcd" in newest_line
+
+
+class TestCleanDryRun:
+    """`clean` is dry-run by default; `--apply` to actually modify the file."""
+
+    def test_clean_default_is_dry_run(
+        self, tmp_path: Path, capsys: pytest.CaptureFixture[str]
+    ) -> None:
+        # Empty history is the simplest case — both modes should report
+        # "No dirty entries to clean." without touching anything.
+        with pytest.raises(SystemExit) as exc_info:
+            handle_history_command(["clean", "--path", str(tmp_path)])
+        assert exc_info.value.code == 0
+        out = capsys.readouterr().out
+        assert "No dirty entries to clean." in out
+
+    def test_clean_apply_flag_accepted(
+        self, tmp_path: Path, capsys: pytest.CaptureFixture[str]
+    ) -> None:
+        with pytest.raises(SystemExit) as exc_info:
+            handle_history_command(["clean", "--apply", "--path", str(tmp_path)])
+        assert exc_info.value.code == 0
+        out = capsys.readouterr().out
+        assert "No dirty entries to clean." in out

From db671a6ca0ecd8d9e1131185678f0d4f862137f4 Mon Sep 17 00:00:00 2001
From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com>
Date: Tue, 28 Apr 2026 07:12:37 +0200
Subject: [PATCH 57/60] fix(history): refuse cross-model compare to avoid
 phantom regressions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`protest history compare` previously flattened cases across all suites
in the two most recent runs. When the runs contained suites under
different ModelLabels (e.g. rules_v1 + rules_v2 in a multi-model
session), a case-id present under both models would surface as
"regressed" or "fixed" depending on which suite the diff happened to
scan first.

Reported by the v3 naive-agent test: 5 strictly-identical runs produced
fake "Regressions: T010, T016" because T010 passed under v2 and failed
under v1 — the diff conflated the two contexts.

Fix: detect distinct ModelLabel.names across the two compared entries and refuse to run when more than one is present, asking the user to disambiguate via --model NAME or --suite NAME (which already suite-prune entries at load time, leaving a single-model comparison). Two new tests cover the rejection and the --model-disambiguated success path. Top-level `protest --help` epilog and the test-bed MISSION.md also get a small refresh to use the new sub-command syntax (`protest history compare/runs/clean`) rather than the now-removed flag-as-mode form. --- protest/cli/history.py | 25 +++++++++++++++ protest/cli/main.py | 5 +-- tests/test_history_cli.py | 66 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 94 insertions(+), 2 deletions(-) diff --git a/protest/cli/history.py b/protest/cli/history.py index 50cf34d..563dd9c 100644 --- a/protest/cli/history.py +++ b/protest/cli/history.py @@ -130,6 +130,19 @@ def _dispatch_view(action: str, nth: int, entries: list[dict[str, Any]]) -> None if len(entries) < 2: print("Need at least 2 runs to compare.") sys.exit(1) + # Refuse to compare across multiple models silently. When two runs + # contain suites with several distinct model labels (e.g. rules_v1 + # and rules_v2 in the same multi-model session), the case-name diff + # would conflate the two contexts and emit phantom regressions. + # Force the user to disambiguate via --model NAME or --suite NAME. + models = _models_in_entries([entries[-1], entries[-2]]) + if len(models) > 1: + print( + "Cannot compare runs that contain multiple models: " + f"{sorted(models)}. Pass --model NAME to compare runs of " + "the same model, or --suite NAME to focus on one suite." 
+ ) + sys.exit(1) out.compare(entries[-1], entries[-2]) elif action == "show": if nth >= len(entries): @@ -142,6 +155,18 @@ def _dispatch_view(action: str, nth: int, entries: list[dict[str, Any]]) -> None out.stats(entries) +def _models_in_entries(entries: list[dict[str, Any]]) -> set[str]: + """Collect distinct, non-empty model labels across the given entries.""" + models: set[str] = set() + for entry in entries: + for sdata in entry.get("suites", {}).values(): + if isinstance(sdata, dict): + model = sdata.get("model") + if model: + models.add(model) + return models + + # --------------------------------------------------------------------------- # Output abstraction — Rich if available, plain text fallback # --------------------------------------------------------------------------- diff --git a/protest/cli/main.py b/protest/cli/main.py index 4aaab5f..2fcc5b1 100644 --- a/protest/cli/main.py +++ b/protest/cli/main.py @@ -27,8 +27,9 @@ protest eval demo:session Run all evaluations protest eval demo:session --show-output Show inputs/output/expected per case protest history --evals Show eval suite trends - protest history --evals --tail 5 Show last 5 entries - protest history --evals --compare Compare 2 most recent runs + protest history runs --evals Run-by-run breakdown + protest history compare --evals Compare 2 most recent runs + protest history clean Preview removable dirty entries protest live Start live reporter server protest tags list demo:session List all available tags """ diff --git a/tests/test_history_cli.py b/tests/test_history_cli.py index e5f6654..d8b9f3c 100644 --- a/tests/test_history_cli.py +++ b/tests/test_history_cli.py @@ -157,6 +157,72 @@ def test_runs_displays_newest_first( assert "newabcd" in newest_line +class TestCompareRefusesMixedModels: + """`compare` must not silently diff across models — would cause false regressions. + + When the two most recent runs each contain suites with several distinct + `ModelLabel.name`s (e.g. 
`rules_v1` + `rules_v2` in a multi-model
+    session), flattening the cases by name conflates contexts: a case-id that
+    passes under one model and fails under the other shows up as a phantom
+    regression. The CLI rejects this and asks the user to disambiguate via
+    `--model NAME` or `--suite NAME`.
+    """
+
+    def _seed_two_model_run(self, tmp_path: Path, run_id: str, ts: str) -> None:
+        path = tmp_path / HISTORY_FILE
+        append_entry(
+            path,
+            {
+                "schema_version": 1,
+                "run_id": run_id,
+                "timestamp": ts,
+                "git": {"commit_short": run_id},
+                "suites": {
+                    "helpdesk_v1": {
+                        "kind": "eval",
+                        "model": "rules_v1",
+                        "passed": 9,
+                        "total_cases": 18,
+                        "cases": {"T010": {"passed": False, "case_hash": "h1"}},
+                    },
+                    "helpdesk_v2": {
+                        "kind": "eval",
+                        "model": "rules_v2",
+                        "passed": 11,
+                        "total_cases": 18,
+                        "cases": {"T010": {"passed": True, "case_hash": "h1"}},
+                    },
+                },
+            },
+        )
+
+    def test_compare_rejects_mixed_models_without_filter(
+        self, tmp_path: Path, capsys: pytest.CaptureFixture[str]
+    ) -> None:
+        self._seed_two_model_run(tmp_path, "aaa1111", "2026-04-27T10:00:00")
+        self._seed_two_model_run(tmp_path, "bbb2222", "2026-04-27T11:00:00")
+        with pytest.raises(SystemExit) as exc_info:
+            handle_history_command(["compare", "--evals", "--path", str(tmp_path)])
+        assert exc_info.value.code == 1
+        out = capsys.readouterr().out
+        assert "multiple models" in out
+        assert "rules_v1" in out and "rules_v2" in out
+        assert "--model" in out
+
+    def test_compare_with_model_filter_succeeds(
+        self, tmp_path: Path, capsys: pytest.CaptureFixture[str]
+    ) -> None:
+        self._seed_two_model_run(tmp_path, "aaa1111", "2026-04-27T10:00:00")
+        self._seed_two_model_run(tmp_path, "bbb2222", "2026-04-27T11:00:00")
+        # `--model rules_v1` prunes helpdesk_v2 out of each entry, leaving
+        # a single-model comparison that should succeed (no false regression).
+ handle_history_command( + ["compare", "--evals", "--model", "rules_v1", "--path", str(tmp_path)] + ) + out = capsys.readouterr().out + assert "multiple models" not in out + + class TestCleanDryRun: """`clean` is dry-run by default; `--apply` to actually modify the file.""" From 37d5c09882f39a41ccd7b87fff54582715454ae7 Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Tue, 28 Apr 2026 12:46:59 +0200 Subject: [PATCH 58/60] refactor(evals): split Evaluator __call__/run, require @evaluator at registration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The single Evaluator.__call__ that switched on isinstance(args[0], EvalContext) forced an Any-typed signature and produced the surprising f is f() identity for the no-kwargs case. Split into __call__(**kwargs) for rebinding and run(ctx) for execution: each method is monomorphic and pyright can read it without overloads. Plain callables are no longer accepted in evaluators=[...]. validate_evaluators runs at registration boundaries (make_eval_wrapper, EvalCase, ShortCircuit) and raises a clear TypeError pointing at @evaluator. The executor then operates on a uniform Evaluator | ShortCircuit Union — the only remaining isinstance is the narrowing on that real disjoint Union. 
--- docs/evals.md | 2 +- protest/evals/evaluator.py | 90 +++++++++++++++++++----- protest/evals/suite.py | 10 ++- protest/evals/wrapper.py | 36 +++++----- tests/evals/test_e2e.py | 60 ++++++++-------- tests/evals/test_evaluator_validation.py | 57 +++++++++++++++ 6 files changed, 185 insertions(+), 70 deletions(-) create mode 100644 tests/evals/test_evaluator_validation.py diff --git a/docs/evals.md b/docs/evals.md index f1bb40e..2831f8f 100644 --- a/docs/evals.md +++ b/docs/evals.md @@ -146,7 +146,7 @@ protest eval evals.session:session --no-tag slow ## Evaluators -An evaluator is a function decorated with `@evaluator` that receives an `EvalContext` and returns a verdict. +An evaluator is a function decorated with `@evaluator` that receives an `EvalContext` and returns a verdict. The decorator is mandatory: passing a plain function in `evaluators=[...]` raises `TypeError` at registration. The wrapping is what gives the evaluator its identity (used for hashing, history, reporting) and a typed `run(ctx)` method — there's no implicit conversion. !!! info "If your eval task returns a non-string output" diff --git a/protest/evals/evaluator.py b/protest/evals/evaluator.py index b493967..8f3927d 100644 --- a/protest/evals/evaluator.py +++ b/protest/evals/evaluator.py @@ -1,22 +1,29 @@ -"""Evaluator primitives — functions, not classes. +"""Evaluator primitives. -An evaluator is a callable that receives an EvalContext and returns a score. -The @evaluator decorator adds partial-application ergonomics: +An evaluator is a function decorated with ``@evaluator`` that receives an +``EvalContext`` and returns a verdict. 
The decorator wraps the function in an +``Evaluator`` instance that carries identity (for hashing/history) and exposes +two distinct entry points: + +- ``ev(keyword=value, ...)`` — bind params, return a new ``Evaluator`` +- ``ev.run(ctx)`` — execute against an ``EvalContext`` (called by the framework) + +Plain callables are not accepted in ``evaluators=[...]``; use ``@evaluator``:: @evaluator def contains_keywords(ctx: EvalContext, keywords: list[str]) -> ContainsKeywordsResult: found = sum(1 for k in keywords if k.lower() in ctx.output.lower()) return ContainsKeywordsResult(keyword_recall=found / len(keywords), ...) - # Bind params → returns a callable(ctx) via functools.partial + # Bind params → returns a fresh Evaluator with kwargs frozen in. evaluators=[contains_keywords(keywords=["paris", "france"])] - # No params → use directly + # No params → use the bare Evaluator directly. @evaluator def not_empty(ctx: EvalContext) -> bool: return bool(ctx.output.strip()) -Async evaluators are supported: +Async evaluators are supported:: @evaluator async def llm_judge(ctx: EvalContext, model: str = "haiku") -> bool: @@ -155,6 +162,7 @@ def __post_init__(self) -> None: "EvalCase.name must be a non-empty string " "(used for history tracking and case identity)." ) + validate_evaluators(self.evaluators) def __repr__(self) -> str: return self.name @@ -177,7 +185,8 @@ class ShortCircuit: ] """ - def __init__(self, evaluators: list[Any]) -> None: + def __init__(self, evaluators: list[Evaluator]) -> None: + validate_evaluators(evaluators, _inside_short_circuit=True) self.evaluators = evaluators def evaluator_identity(self) -> dict[str, Any]: @@ -185,6 +194,40 @@ def evaluator_identity(self) -> dict[str, Any]: return {"short_circuit": [_canonical(e) for e in self.evaluators]} +def validate_evaluators( + items: list[Any], *, _inside_short_circuit: bool = False +) -> None: + """Reject anything that isn't a registered Evaluator (or ShortCircuit). 
+ + ``@evaluator`` is the only sanctioned path to producing an evaluator. Plain + callables used to be accepted, which forced a runtime ``isinstance`` dispatch + in the executor and made the evaluators list type effectively ``list[Any]``. + Failing loud at registration moves the error to the boundary and lets + downstream code work on a uniform ``Evaluator | ShortCircuit`` Union. + """ + for item in items: + if isinstance(item, Evaluator): + continue + if isinstance(item, ShortCircuit) and not _inside_short_circuit: + continue + if _inside_short_circuit and isinstance(item, ShortCircuit): + raise TypeError( + "ShortCircuit cannot nest another ShortCircuit; " + "flatten the inner evaluators into the outer group." + ) + if callable(item): + raise TypeError( + f"{item!r} is a plain callable, not an Evaluator. " + "Wrap it with @evaluator (from protest.evals) so it carries " + "identity, hashing, and a typed run() method." + ) + raise TypeError( + f"Expected Evaluator or ShortCircuit, got {type(item).__name__}. " + "Only objects produced by @evaluator (or ShortCircuit groups) " + "are accepted in evaluators=[...]." + ) + + class Metric: """Annotate a float/int field as a metric for stats aggregation.""" @@ -232,10 +275,14 @@ def extract_scores_from_result(result: Any, evaluator_name: str) -> list[Any]: class Evaluator: """A configured evaluator — callable with identity for hashing. - Created by the ``@evaluator`` decorator. Supports two calling modes: + Created by the ``@evaluator`` decorator. Two distinct entry points: + + - ``ev(keyword=value, ...)`` — bind params, return a new Evaluator + - ``ev.run(ctx)`` — execute against an EvalContext - 1. ``ev(ctx)`` — evaluate directly (first arg is EvalContext) - 2. 
``ev(keyword=value, ...)`` — bind params, return a new Evaluator + Splitting these avoids the "callable that does two things based on the + type of arg[0]" anti-pattern: each method has a single, monomorphic + signature that type checkers can read without overload gymnastics. """ __slots__ = ("_fn", "_kwargs", "_name", "_qualname") @@ -252,16 +299,15 @@ def __init__( def name(self) -> str: return self._name - def __call__(self, *args: Any, **kwargs: Any) -> Any: - if args and isinstance(args[0], EvalContext): - merged = {**self._kwargs, **kwargs} - return self._fn(*args, **merged) - # Re-binding form (no EvalContext): always returns a fresh clone. - # Returning `self` for the no-kwargs case used to make `f is f()` - # accidentally true, which surprised users expecting `()` to behave - # like an evaluator constructor. + def __call__(self, **kwargs: Any) -> Evaluator: + # Re-binding form: always returns a fresh clone. Returning `self` + # for the no-kwargs case used to make `f is f()` accidentally true, + # which surprised users expecting `()` to behave like a constructor. return Evaluator(self._fn, {**self._kwargs, **kwargs}) + def run(self, ctx: EvalContext[Any, Any], /) -> Any: + return self._fn(ctx, **self._kwargs) + def evaluator_identity(self) -> dict[str, Any]: identity: dict[str, Any] = {"fn": self._qualname} if self._kwargs: @@ -276,5 +322,11 @@ def __repr__(self) -> str: def evaluator(fn: Callable[..., Any]) -> Evaluator: - """Turn a function into a ProTest evaluator.""" + """Turn a function into a ProTest evaluator. + + The decorator is the only sanctioned way to produce an object that + ``evaluators=[...]`` will accept. Plain callables are rejected at + registration so the executor can rely on a uniform Union type instead + of dispatching at runtime. 
+ """ return Evaluator(fn) diff --git a/protest/evals/suite.py b/protest/evals/suite.py index 4971e17..67e277c 100644 --- a/protest/evals/suite.py +++ b/protest/evals/suite.py @@ -9,8 +9,9 @@ from protest.evals.wrapper import make_eval_wrapper if TYPE_CHECKING: - from collections.abc import Callable + from collections.abc import Callable, Sequence + from protest.evals.evaluator import Evaluator, ShortCircuit from protest.evals.types import Judge, ModelLabel FuncT = TypeVar("FuncT", bound="Callable[..., object]") @@ -65,7 +66,7 @@ def model(self) -> ModelLabel | None: def eval( self, - evaluators: list[Any] | None = None, + evaluators: Sequence[Evaluator | ShortCircuit] | None = None, tags: list[str] | None = None, timeout: float | None = None, judge: Judge | None = None, @@ -83,9 +84,12 @@ def eval( def decorator(func: FuncT) -> FuncT: resolved_judge = judge or self._judge + evals_list: list[Evaluator | ShortCircuit] = ( + list(evaluators) if evaluators else [] + ) wrapper = make_eval_wrapper( func, - evaluators or [], + evals_list, judge=resolved_judge, ) self.test(tags=tags, timeout=timeout, is_eval=True)(wrapper) diff --git a/protest/evals/wrapper.py b/protest/evals/wrapper.py index 9601a7c..3f07cc3 100644 --- a/protest/evals/wrapper.py +++ b/protest/evals/wrapper.py @@ -20,6 +20,7 @@ Evaluator, ShortCircuit, extract_scores_from_result, + validate_evaluators, ) from protest.evals.hashing import compute_case_hash, compute_eval_hash from protest.evals.types import EvalScore, TaskResult @@ -32,12 +33,13 @@ def make_eval_wrapper( func: Any, - evaluators: list[Any], + evaluators: list[Evaluator | ShortCircuit], judge: Any = None, ) -> Any: """Wrap a function to run evaluators on its return value.""" _validate_single_evalcase_param(func) + validate_evaluators(evaluators) @functools.wraps(func) async def eval_wrapper(**kwargs: Any) -> EvalPayload: @@ -207,7 +209,7 @@ def _extract_per_case_evaluators(kwargs: dict[str, Any]) -> list[Any]: async def run_evaluators( - 
evaluators: list[Any], + evaluators: list[Evaluator | ShortCircuit], case_name: str, inputs: Any, output: Any, @@ -216,7 +218,12 @@ async def run_evaluators( duration: float, judge: Any = None, ) -> tuple[list[EvalScore], EvalContext[Any, Any]]: - """Run evaluators and return (scores, ctx with judge stats).""" + """Run evaluators and return (scores, ctx with judge stats). + + Callers must have validated the list (Evaluator | ShortCircuit only) at the + boundary; the loop below trusts the Union and uses isinstance solely to + narrow it — the only legitimate isinstance kept in this module. + """ ctx = EvalContext( name=case_name, inputs=inputs, @@ -233,40 +240,35 @@ async def run_evaluators( scores.extend(await _run_short_circuit(ev.evaluators, ctx)) continue - evaluator_name = ev.name if isinstance(ev, Evaluator) else type(ev).__name__ try: - raw = ev(ctx) + raw = ev.run(ctx) result = await raw if asyncio.iscoroutine(raw) else raw - scores.extend(extract_scores_from_result(result, evaluator_name)) + scores.extend(extract_scores_from_result(result, ev.name)) except Exception as exc: - raise FixtureError(f"evaluator '{evaluator_name}'", exc) from exc + raise FixtureError(f"evaluator '{ev.name}'", exc) from exc return scores, ctx async def _run_short_circuit( - evaluators: list[Any], + evaluators: list[Evaluator], ctx: EvalContext[Any, Any], ) -> list[EvalScore]: """Run evaluators in order, stop at first Verdict=False.""" scores: list[EvalScore] = [] for i, ev in enumerate(evaluators): - evaluator_name = ev.name if isinstance(ev, Evaluator) else type(ev).__name__ try: - raw = ev(ctx) + raw = ev.run(ctx) result = await raw if asyncio.iscoroutine(raw) else raw except Exception as exc: - raise FixtureError(f"evaluator '{evaluator_name}'", exc) from exc - extracted = extract_scores_from_result(result, evaluator_name) + raise FixtureError(f"evaluator '{ev.name}'", exc) from exc + extracted = extract_scores_from_result(result, ev.name) scores.extend(extracted) if 
any(s.is_verdict and not s.passed for s in extracted): # Mark remaining evaluators as skipped for skipped_ev in evaluators[i + 1 :]: - skipped_name = ( - skipped_ev.name - if isinstance(skipped_ev, Evaluator) - else type(skipped_ev).__name__ + scores.append( + EvalScore(name=skipped_ev.name, value=False, skipped=True) ) - scores.append(EvalScore(name=skipped_name, value=False, skipped=True)) break return scores diff --git a/tests/evals/test_e2e.py b/tests/evals/test_e2e.py index 7daf058..75def3c 100644 --- a/tests/evals/test_e2e.py +++ b/tests/evals/test_e2e.py @@ -717,14 +717,14 @@ def _make_ctx(self, output: str, expected: str | None = None) -> EvalContext: def test_contains_keywords(self) -> None: e = contains_keywords(keywords=["hello", "world"]) - result = e(self._make_ctx("Hello World")) + result = e.run(self._make_ctx("Hello World")) assert result.keyword_recall == 1.0 assert result.all_keywords_present is True def test_contains_keywords_default_requires_all(self) -> None: """Default `min_recall=1.0` means strict: missing one → verdict False.""" e = contains_keywords(keywords=["hello", "world"]) - result = e(self._make_ctx("Only hello here")) + result = e.run(self._make_ctx("Only hello here")) assert result.keyword_recall == 0.5 assert result.all_keywords_present is False @@ -736,38 +736,38 @@ def test_contains_keywords_threshold_continuity_at_zero(self) -> None: Now `recall >= min_recall` applies uniformly. 
""" e = contains_keywords(keywords=["alpha", "beta"], min_recall=0.0) - result = e(self._make_ctx("nothing matches")) + result = e.run(self._make_ctx("nothing matches")) assert result.keyword_recall == 0.0 assert result.all_keywords_present is True def test_contains_keywords_threshold_at_exact_value(self) -> None: """Verdict passes when recall equals the threshold exactly.""" e = contains_keywords(keywords=["alpha", "beta"], min_recall=0.5) - result = e(self._make_ctx("only alpha here")) + result = e.run(self._make_ctx("only alpha here")) assert result.keyword_recall == 0.5 assert result.all_keywords_present is True def test_contains_keywords_threshold_just_below(self) -> None: """Verdict fails when recall is below the threshold.""" e = contains_keywords(keywords=["alpha", "beta", "gamma"], min_recall=0.5) - result = e(self._make_ctx("only alpha")) + result = e.run(self._make_ctx("only alpha")) assert abs(result.keyword_recall - 1 / 3) < 1e-9 assert result.all_keywords_present is False def test_contains_expected(self) -> None: e = contains_expected - assert e(self._make_ctx("Hello World", "world")) is True - assert e(self._make_ctx("Hello", "world")) is False + assert e.run(self._make_ctx("Hello World", "world")) is True + assert e.run(self._make_ctx("Hello", "world")) is False def test_does_not_contain(self) -> None: e = does_not_contain(forbidden=["cat", "dog"]) - assert e(self._make_ctx("Yorkshire")).no_forbidden_words is True - assert e(self._make_ctx("I like cats")).no_forbidden_words is False + assert e.run(self._make_ctx("Yorkshire")).no_forbidden_words is True + assert e.run(self._make_ctx("I like cats")).no_forbidden_words is False def test_not_empty(self) -> None: - assert not_empty(self._make_ctx("hello")) is True - assert not_empty(self._make_ctx("")) is False - assert not_empty(self._make_ctx(" ")) is False + assert not_empty.run(self._make_ctx("hello")) is True + assert not_empty.run(self._make_ctx("")) is False + assert not_empty.run(self._make_ctx(" 
")) is False def test_not_empty_handles_sized_containers(self) -> None: """Sized containers: empty -> False, non-empty -> True. @@ -779,63 +779,63 @@ def test_not_empty_handles_sized_containers(self) -> None: # Helper accepts Any at runtime; type hint is just a default. ctx_empty_list: Any = self._make_ctx("") ctx_empty_list.output = [] - assert not_empty(ctx_empty_list) is False + assert not_empty.run(ctx_empty_list) is False ctx_nonempty_list: Any = self._make_ctx("") ctx_nonempty_list.output = [1, 2] - assert not_empty(ctx_nonempty_list) is True + assert not_empty.run(ctx_nonempty_list) is True ctx_empty_dict: Any = self._make_ctx("") ctx_empty_dict.output = {} - assert not_empty(ctx_empty_dict) is False + assert not_empty.run(ctx_empty_dict) is False ctx_nonempty_dict: Any = self._make_ctx("") ctx_nonempty_dict.output = {"a": 1} - assert not_empty(ctx_nonempty_dict) is True + assert not_empty.run(ctx_nonempty_dict) is True ctx_empty_set: Any = self._make_ctx("") ctx_empty_set.output = set() - assert not_empty(ctx_empty_set) is False + assert not_empty.run(ctx_empty_set) is False def test_not_empty_unsized_objects_still_pass(self) -> None: """Non-Sized values (int, float, dataclass): always True (kept as-is).""" ctx_int: Any = self._make_ctx("") ctx_int.output = 42 - assert not_empty(ctx_int) is True + assert not_empty.run(ctx_int) is True ctx_zero: Any = self._make_ctx("") ctx_zero.output = 0 # 0 is not None, not Sized — still passes. 
- assert not_empty(ctx_zero) is True + assert not_empty.run(ctx_zero) is True def test_max_length(self) -> None: e = max_length(max_chars=5) - result = e(self._make_ctx("hi")) + result = e.run(self._make_ctx("hi")) assert result.within_limit is True - result = e(self._make_ctx("this is too long")) + result = e.run(self._make_ctx("this is too long")) assert result.within_limit is False def test_min_length(self) -> None: - assert min_length(min_chars=3)(self._make_ctx("hello")) is True - assert min_length(min_chars=10)(self._make_ctx("hi")) is False + assert min_length(min_chars=3).run(self._make_ctx("hello")) is True + assert min_length(min_chars=10).run(self._make_ctx("hi")) is False def test_matches_regex(self) -> None: e = matches_regex(pattern=r"\d{3}-\d{4}") - assert e(self._make_ctx("Call 555-1234")) is True - assert e(self._make_ctx("no numbers")) is False + assert e.run(self._make_ctx("Call 555-1234")) is True + assert e.run(self._make_ctx("no numbers")) is False def test_json_valid(self) -> None: e = json_valid(required_keys=["name"]) - result = e(self._make_ctx('{"name": "Rex"}')) + result = e.run(self._make_ctx('{"name": "Rex"}')) assert result.valid_json is True assert result.has_required_keys is True - result = e(self._make_ctx("not json")) + result = e.run(self._make_ctx("not json")) assert result.valid_json is False def test_word_overlap(self) -> None: e = word_overlap - assert e(self._make_ctx("hello world", "hello world")).overlap == 1.0 - assert e(self._make_ctx("hello there", "hello world")).overlap == 0.5 - assert e(self._make_ctx("foo", "hello world")).overlap == 0.0 + assert e.run(self._make_ctx("hello world", "hello world")).overlap == 1.0 + assert e.run(self._make_ctx("hello there", "hello world")).overlap == 0.5 + assert e.run(self._make_ctx("foo", "hello world")).overlap == 0.0 # --------------------------------------------------------------------------- diff --git a/tests/evals/test_evaluator_validation.py 
b/tests/evals/test_evaluator_validation.py new file mode 100644 index 0000000..584a988 --- /dev/null +++ b/tests/evals/test_evaluator_validation.py @@ -0,0 +1,57 @@ +"""Validation that evaluators=[...] only accepts @evaluator-wrapped objects. + +Plain callables and arbitrary values used to be silently accepted, forcing a +runtime ``isinstance`` dispatch in the executor. Validating at the boundary +turns the failure into a clear TypeError at registration time and lets the +downstream code work on a uniform ``Evaluator | ShortCircuit`` Union. +""" + +from __future__ import annotations + +import pytest + +from protest.evals.evaluator import ( + EvalCase, + EvalContext, + ShortCircuit, + evaluator, + validate_evaluators, +) + + +@evaluator +def _ok(ctx: EvalContext) -> bool: + return True + + +def _plain_callable(ctx: EvalContext) -> bool: + return True + + +class TestValidateEvaluators: + def test_accepts_evaluator(self) -> None: + validate_evaluators([_ok]) + + def test_accepts_short_circuit(self) -> None: + validate_evaluators([ShortCircuit([_ok])]) + + def test_rejects_plain_callable(self) -> None: + with pytest.raises(TypeError, match="@evaluator"): + validate_evaluators([_plain_callable]) + + def test_rejects_non_callable(self) -> None: + with pytest.raises(TypeError, match="Expected Evaluator or ShortCircuit"): + validate_evaluators(["not_an_evaluator"]) # type: ignore[list-item] + + def test_rejects_nested_short_circuit(self) -> None: + with pytest.raises(TypeError, match="cannot nest"): + ShortCircuit([ShortCircuit([_ok])]) # type: ignore[list-item] + + +class TestEvalCaseValidates: + def test_evalcase_rejects_plain_callable(self) -> None: + with pytest.raises(TypeError, match="@evaluator"): + EvalCase(inputs="x", name="c", evaluators=[_plain_callable]) + + def test_evalcase_accepts_evaluator(self) -> None: + EvalCase(inputs="x", name="c", evaluators=[_ok]) From 8e388ca37ac8eccb466f38cc3898a961bdd39749 Mon Sep 17 00:00:00 2001 From: Renaud Cepre 
<32103211+renaudcepre@users.noreply.github.com> Date: Tue, 28 Apr 2026 22:47:20 +0200 Subject: [PATCH 59/60] fix(evals,history): polish from naive-agent v4 feedback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - evals.md: EvalCase field table listed `tags` as a special metadata key while the example below used `tags=[...]` as a kwarg and the dataclass declares it first-class. Split into separate `tags` / `metadata` rows. - evals.md: history compare example now shows `--model NAME` with the rationale, so users hit the constraint at read time instead of via the runtime "multiple models" rejection. - history.py: Run Detail panel title now carries a "(+ pass · - fail)" legend; the +/- markers were unlabeled and required inference. --- docs/evals.md | 9 +++++++-- protest/cli/history.py | 6 +++++- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/docs/evals.md b/docs/evals.md index 2831f8f..4e22920 100644 --- a/docs/evals.md +++ b/docs/evals.md @@ -119,7 +119,8 @@ cases = ForEach([ | `expected` | `Any` | Expected output (passed to evaluators as `ctx.expected_output`) | | `name` | `str` | Case identifier (used in test IDs and history) | | `evaluators` | `list` | Per-case evaluators (added to suite-level ones) | -| `metadata` | `dict` | Arbitrary metadata (special key: `"tags"` — see below) | +| `tags` | `list[str]` | First-class tags — flow to `protest eval --tag …` (see below) | +| `metadata` | `dict` | Arbitrary metadata, opaque to the framework | ### Why `EvalCase` and not a dict? @@ -663,7 +664,11 @@ protest history --evals --runs protest history --evals --show # Compare last two runs (fixed/regressed/new) -protest history --evals --compare +# Requires --model NAME if your history mixes multiple model labels +# (e.g. one suite per rules version) — comparing across labels is rejected +# to avoid phantom regressions where a case "fails" only because the two +# runs being diffed used different models. 
+protest history --evals --compare --model rules_v1 ``` ### Integrity Hashes diff --git a/protest/cli/history.py b/protest/cli/history.py index 563dd9c..19b6a97 100644 --- a/protest/cli/history.py +++ b/protest/cli/history.py @@ -364,7 +364,11 @@ def detail(self, entry: dict[str, Any]) -> None: self.console.print() self.console.print( - Panel(lines, title="[bold]Run Detail[/]", border_style="cyan") + Panel( + lines, + title="[bold]Run Detail[/] [dim]([green]+[/] pass · [red]-[/] fail)[/]", + border_style="cyan", + ) ) def compare(self, current: dict[str, Any], previous: dict[str, Any]) -> None: From 99d512f55da6b362c00c2e96f2b0340d6ce2ad18 Mon Sep 17 00:00:00 2001 From: Renaud Cepre <32103211+renaudcepre@users.noreply.github.com> Date: Tue, 28 Apr 2026 23:08:02 +0200 Subject: [PATCH 60/60] refactor(examples): rename yorkshire dataset.py to cases.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Vestige from the pydantic-evals era — there is no Dataset concept in the native eval API. The file holds EvalCase instances, so cases.py matches the vocabulary used by EvalSuite, EvalCase, and the --last-failed CLI flag. 
--- examples/yorkshire/evals/{dataset.py => cases.py} | 2 +- examples/yorkshire/evals/session.py | 2 +- examples/yorkshire/session.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) rename examples/yorkshire/evals/{dataset.py => cases.py} (98%) diff --git a/examples/yorkshire/evals/dataset.py b/examples/yorkshire/evals/cases.py similarity index 98% rename from examples/yorkshire/evals/dataset.py rename to examples/yorkshire/evals/cases.py index 423ad76..f50eae9 100644 --- a/examples/yorkshire/evals/dataset.py +++ b/examples/yorkshire/evals/cases.py @@ -1,4 +1,4 @@ -"""Dataset for the Yorkshire chatbot evals.""" +"""Eval cases for the Yorkshire chatbot.""" from __future__ import annotations diff --git a/examples/yorkshire/evals/session.py b/examples/yorkshire/evals/session.py index 06d9b3f..e23f1d7 100644 --- a/examples/yorkshire/evals/session.py +++ b/examples/yorkshire/evals/session.py @@ -11,7 +11,7 @@ from typing import Annotated from examples.yorkshire.app.chatbot import yorkshire_chatbot -from examples.yorkshire.evals.dataset import ( +from examples.yorkshire.evals.cases import ( suite_evaluators, yorkshire_cases, ) diff --git a/examples/yorkshire/session.py b/examples/yorkshire/session.py index c4ffeb0..f1347b7 100644 --- a/examples/yorkshire/session.py +++ b/examples/yorkshire/session.py @@ -14,7 +14,7 @@ from typing import Annotated from examples.yorkshire.app.chatbot import yorkshire_chatbot -from examples.yorkshire.evals.dataset import suite_evaluators, yorkshire_cases +from examples.yorkshire.evals.cases import suite_evaluators, yorkshire_cases from examples.yorkshire.tests.fixtures import ( configure_kennel_logging, kennel,